Skip to content

Enable TextLoader to accept new lines in quoted fields #5125

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
May 19, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
e03b0e4
fix textloader bug on quotes
LittleLittleCloud Dec 17, 2019
3ddb3b0
use static method instead of extension method
LittleLittleCloud Dec 19, 2019
9b20ead
better name
LittleLittleCloud Dec 19, 2019
34d9efa
Merge branch 'master' into u/xiaoyun/fixTextLoaderBug
LittleLittleCloud Apr 15, 2020
ac6f971
Merge remote-tracking branch 'xiaoyun/u/xiaoyun/fixTextLoaderBug' int…
antoniovs1029 May 11, 2020
8304d62
Moved multiline reader to TextLoaderCursor, and added return when mul…
antoniovs1029 May 13, 2020
a9e91e2
Fix issue with empty unquoted new lines
antoniovs1029 May 13, 2020
9cfaee1
Make ReadMultiLine a little bit more efficient
antoniovs1029 May 13, 2020
2827c61
Add clarifying comment in FetchNextField
antoniovs1029 May 13, 2020
a427662
Added test
antoniovs1029 May 13, 2020
1cafbf0
Added test for column inference
antoniovs1029 May 14, 2020
6116d97
Make multilinereader a little bit more efficient
antoniovs1029 May 14, 2020
df2ca25
Create read multilines parameter
antoniovs1029 May 14, 2020
530f41e
Make tests run with readMultilines parameter
antoniovs1029 May 14, 2020
d9af9d2
Remove public parameter for readMultilines, as it is considered a bre…
antoniovs1029 May 15, 2020
6a7b632
Update manifest
antoniovs1029 May 15, 2020
fb6ab28
Update src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
antoniovs1029 May 15, 2020
f592a7f
Update src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
antoniovs1029 May 15, 2020
a13b803
Removed new readMultiline option from AutoML.NET and removed the test…
antoniovs1029 May 15, 2020
c2d2ac7
Merge branch 'is4460TextLoaderQuoting' of https://github.com/antoniov…
antoniovs1029 May 15, 2020
13033bf
Actually add new line characters to loaded strings
antoniovs1029 May 16, 2020
2983b06
Actually include new line characters in loaded strings
antoniovs1029 May 16, 2020
9789479
Let the TextSaver also correctly save new lines inside quoted fields
antoniovs1029 May 18, 2020
60c4169
Fixed bug when calling GetSomeLines
antoniovs1029 May 18, 2020
fa28ddd
Throw exception when reaching EOF without ending quote on a given field.
antoniovs1029 May 19, 2020
56279b4
Refactored readMultilines into OptionsFlags
antoniovs1029 May 19, 2020
51d9390
Added test to check new readMultiline option from MAML.
antoniovs1029 May 19, 2020
5cc7512
Merge remote-tracking branch 'upstream/master' into is4460TextLoaderQ…
antoniovs1029 May 19, 2020
5be5f6f
Fixed mistake on an unrelated test
antoniovs1029 May 19, 2020
b4e3029
Added internal default to ReadMultilines
antoniovs1029 May 19, 2020
7e16fc7
Do more checking in MultilineReader in order to be more flexible in a…
antoniovs1029 May 19, 2020
f0652a5
Updated tests
antoniovs1029 May 19, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Throw exception when reaching EOF without ending quote on a given field.
  • Loading branch information
antoniovs1029 committed May 19, 2020
commit fa28ddd4e4be8533d0fc457b49550b0f339056b8
8 changes: 4 additions & 4 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ private static class MultiLineReader
// When reading lines that contain quoted fields, the quoted fields can contain
// '\n' so we'll need to read multiple lines (multilines) to get all the fields
// of a given row.
public static string ReadMultiLine(TextReader sr, StringBuilder sb, bool ignoreHashLine)
public static string ReadMultiLine(TextReader sr, StringBuilder sb, long lineNum, bool ignoreHashLine)
{
string line;
line = sr.ReadLine();
Expand Down Expand Up @@ -503,7 +503,7 @@ public static string ReadMultiLine(TextReader sr, StringBuilder sb, bool ignoreH
line = sr.ReadLine();

if (line == null) // If we've reached the end of the file
break; // MYTODO: This could happen if we have an invalid open quote which never closes so we reach the end of the file without properly closing the field, should we throw instead in this case?
throw new EndOfStreamException($"A quote opened on a field on line {lineNum} was never closed, and we've read to the last line in the file without finding the closing quote");

sb.Append("\n");
sb.Append(line);
Expand Down Expand Up @@ -549,7 +549,7 @@ private void ThreadProc()
// introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder)
// and implementing all the necessary conversion functionality on it. See task 3871.
if (_readMultilines)
text = MultiLineReader.ReadMultiLine(rdr, multilineSB, true);
text = MultiLineReader.ReadMultiLine(rdr, multilineSB, line, true);
else
text = rdr.ReadLine();

Expand Down Expand Up @@ -580,7 +580,7 @@ private void ThreadProc()
return;

if (_readMultilines)
text = MultiLineReader.ReadMultiLine(rdr, multilineSB, true);
text = MultiLineReader.ReadMultiLine(rdr, multilineSB, line, false);
else
text = rdr.ReadLine();

Expand Down
49 changes: 49 additions & 0 deletions test/Microsoft.ML.Tests/TextLoaderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1013,5 +1013,54 @@ public void TestLoadTextWithEscapedNewLines(bool useSaved)
}
}
}

[Fact]
public void TestInvalidMultilineCSVQuote()
{
    // Row "10" opens a quoted field that is never closed. With ReadMultilines enabled,
    // loading this file is expected to fail rather than silently produce rows.
    string malformedCsv =
        "id,description,animal\n" +
        "9,\"this is a quoted field correctly formatted\",cat\n" +
        "10,\"this is a quoted field\nwithout closing quote,cat\n" +
        "11,this field isn't quoted,dog\n" +
        "12,this will reach the end of the file without finding a closing quote so it will throw,frog\n";

    var mlContext = new MLContext(seed: 1);

    var csvPath = GetOutputPath("multiline-invalid.csv");
    File.WriteAllText(csvPath, malformedCsv);

    bool loadFailed;
    try
    {
        var columns = new[]
        {
            new TextLoader.Column("id", DataKind.Int32, 0),
            new TextLoader.Column("description", DataKind.String, 1),
            new TextLoader.Column("animal", DataKind.String, 2),
        };

        var options = new TextLoader.Options()
        {
            Columns = columns,
            ReadMultilines = true,
            AllowQuoting = true,
            Separator = ",",
            HasHeader = true,
        };

        // Previewing the data is what triggers the load in this test.
        mlContext.Data.LoadFromTextFile(csvPath, options).Preview();
        loadFailed = false;
    }
    catch (Exception ex) when (ex is EndOfStreamException || ex is FormatException)
    {
        // Only these two exception types count as the expected failure;
        // anything else propagates and fails the test on its own.
        loadFailed = true;
    }

    Assert.True(loadFailed, "Invalid file should have thrown an exception");
}
}
}