Skip to content

support sweeping multiline option in AutoML #5148

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
May 21, 2020
Prev Previous commit
Next Next commit
add test for AutoML inferColumn API
  • Loading branch information
LittleLittleCloud committed May 20, 2020
commit e7cb2b50c8e0abd870adf1fa9cf4dd0e919504cc
3 changes: 3 additions & 0 deletions src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path
Separators = new[] { splitInference.Separator.Value },
AllowSparse = splitInference.AllowSparse,
AllowQuoting = splitInference.AllowQuote,
ReadMultilines = splitInference.ReadMultilines,
HasHeader = hasHeader,
TrimWhitespace = trimWhitespace
};
Expand Down Expand Up @@ -91,6 +92,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path
AllowQuoting = splitInference.AllowQuote,
AllowSparse = splitInference.AllowSparse,
Separators = new char[] { splitInference.Separator.Value },
ReadMultilines = splitInference.ReadMultilines,
HasHeader = hasHeader,
TrimWhitespace = trimWhitespace
};
Expand Down Expand Up @@ -139,6 +141,7 @@ private static ColumnTypeInference.InferenceResult InferColumnTypes(MLContext co
Separator = splitInference.Separator.Value,
AllowSparse = splitInference.AllowSparse,
AllowQuote = splitInference.AllowQuote,
ReadMultilines = splitInference.ReadMultilines,
HasHeader = hasHeader,
LabelColumnIndex = labelColumnIndex,
Label = label
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ internal sealed class Arguments
public int MaxRowsToRead;
public uint? LabelColumnIndex;
public string Label;
public bool ReadMultilines;

public Arguments()
{
Expand Down Expand Up @@ -262,6 +263,7 @@ private static InferenceResult InferTextFileColumnTypesCore(MLContext context, I
Separators = new[] { args.Separator },
AllowSparse = args.AllowSparse,
AllowQuoting = args.AllowQuote,
ReadMultilines = args.ReadMultilines,
};
var textLoader = context.Data.CreateTextLoader(textLoaderOptions);
var idv = textLoader.Load(fileSource);
Expand Down
8 changes: 5 additions & 3 deletions src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,16 @@ public class ColumnSplitResult

public bool AllowQuote { get; set; }
public bool AllowSparse { get; set; }
public bool ReadMultilines { get; set; }

public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool allowSparse, int columnCount)
public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool readMultilines, bool allowSparse, int columnCount)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Following the naming convention, this would be:

Suggested change
public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool readMultilines, bool allowSparse, int columnCount)
public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool allowMultiline, bool allowSparse, int columnCount)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TextLoader.Options uses ReadMultilines (the same goes for allowQuote and allowSparse), so we'd better follow its naming style.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resolved

{
IsSuccess = isSuccess;
Separator = separator;
AllowQuote = allowQuote;
AllowSparse = allowSparse;
ColumnCount = columnCount;
ReadMultilines = readMultilines;
}
}

Expand Down Expand Up @@ -78,7 +80,7 @@ from _tryMultiline in tryMultiline
break;
}
}
return foundAny ? result : new ColumnSplitResult(false, null, true, true, 0);
return foundAny ? result : new ColumnSplitResult(false, null, true, true, true, 0);
}

private static bool TryParseFile(MLContext context, TextLoader.Options options, IMultiStreamSource source,
Expand Down Expand Up @@ -114,7 +116,7 @@ private static bool TryParseFile(MLContext context, TextLoader.Options options,
// disallow single-column case
if (mostCommon.Key <= 1) { return false; }

result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.AllowSparse, mostCommon.Key);
result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.ReadMultilines, options.AllowSparse, mostCommon.Key);
return true;
}
// fail gracefully if unable to instantiate data view with swept arguments
Expand Down
21 changes: 21 additions & 0 deletions test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -200,5 +200,26 @@ public void TrySplitColumns_should_split_on_dataset_with_newline_between_double_
result.Separator.Should().Be(',');
result.IsSuccess.Should().BeTrue();
}

[Fact]
public void InferColumnsFromMultilineInputFile()
{
    // Check that column information can be inferred from an input file
    // that has escaped newlines inside quotes.
    var dataPath = GetDataPath("multiline.csv");
    var mlContext = new MLContext();
    var inputColumnInformation = new ColumnInformation
    {
        LabelColumnName = @"id"
    };

    var result = mlContext.Auto().InferColumns(dataPath, inputColumnInformation);

    // The file has only 3 columns: "id", "description" and "animal".
    var columnInformation = result.ColumnInformation;

    // Assert.Equal against a non-null expected value also fails on null,
    // so no separate null check on the label is needed.
    Assert.Equal("id", columnInformation.LabelColumnName);

    // Assert.Single verifies that the collection holds exactly one element
    // and returns it, which is then compared with the expected column name.
    Assert.Equal("description", Assert.Single(columnInformation.TextColumnNames));
    Assert.Equal("animal", Assert.Single(columnInformation.CategoricalColumnNames));
}
}
}