Skip to content

Add public generic methods to TextLoader catalog that accept Options objects #5134

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 18 additions & 9 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1453,6 +1453,23 @@ internal static TextLoader CreateTextLoader<TInput>(IHostEnvironment host,
bool trimWhitespace = Defaults.TrimWhitespace,
IMultiStreamSource dataSample = null)
{
Options options = new Options
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the plan to add readMultiLines as an option in a later PR?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

readMultilines is already added to the TextLoader.Options class in my other PR, and it will default to false if no value is provided.

The method here, that you've pointed to, is used by this public method in the catalog. I can't add a new readMultilines parameter there precisely because it would be a breaking API change. So it doesn't make much sense to me to add a readMultilines option here, and so TextLoader.Options will simply use the default for it (which is false).

In the future, if a user (or we internally) wants to use the new options (such as readMultilines), they would simply use a TextLoader.Options object and the overloads for creating TextLoaders that use the options object. I think it's better and easier to maintain and update only the options object than an overload with several parameters that can't be changed in the public API.

HasHeader = hasHeader,
Separators = new[] { separator },
AllowQuoting = allowQuoting,
AllowSparse = supportSparse,
TrimWhitespace = trimWhitespace
};

return CreateTextLoader<TInput>(host, options, dataSample);
}

internal static TextLoader CreateTextLoader<TInput>(IHostEnvironment host,
Options options = null,
IMultiStreamSource dataSample = null)
{
options = options ?? new Options();
var userType = typeof(TInput);

var fieldInfos = userType.GetFields(BindingFlags.Public | BindingFlags.Instance);
Expand Down Expand Up @@ -1506,15 +1523,7 @@ internal static TextLoader CreateTextLoader<TInput>(IHostEnvironment host,
columns.Add(column);
}

Options options = new Options
{
HasHeader = hasHeader,
Separators = new[] { separator },
AllowQuoting = allowQuoting,
AllowSparse = supportSparse,
TrimWhitespace = trimWhitespace,
Columns = columns.ToArray()
};
options.Columns = columns.ToArray();

return new TextLoader(host, options, dataSample: dataSample);
}
Expand Down
63 changes: 49 additions & 14 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,19 @@ public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog cat
=> TextLoader.CreateTextLoader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting,
allowSparse, trimWhitespace, dataSample: dataSample);

/// <summary>
/// Create a text loader <see cref="TextLoader"/> by inferring the dataset schema from a data model type.
/// </summary>
/// <typeparam name="TInput">The data model type from which the dataset schema is inferred.</typeparam>
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="options">Defines the settings of the load operation. The Columns field does not need to be set,
/// because the columns are inferred by this method.</param>
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer information
/// about the columns, such as slot names.</param>
/// <returns>The <see cref="TextLoader"/> built from <paramref name="options"/> and the inferred columns.</returns>
public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog catalog,
    TextLoader.Options options,
    IMultiStreamSource dataSample = null)
{
    // Resolve the host environment first, then delegate to the options-based factory on TextLoader.
    var environment = CatalogUtils.GetEnvironment(catalog);
    return TextLoader.CreateTextLoader<TInput>(environment, options, dataSample);
}

/// <summary>
/// Load a <see cref="IDataView"/> from a text file using <see cref="TextLoader"/>.
/// Note that <see cref="IDataView"/>'s are lazy, so no actual loading happens here, just schema validation.
Expand Down Expand Up @@ -143,6 +156,35 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog,
return loader.Load(new MultiFileSource(path));
}

/// <summary>
/// Load a <see cref="IDataView"/> from a text file using a <see cref="TextLoader"/> configured by an options object.
/// Note that <see cref="IDataView"/>'s are lazy, so no actual loading happens here, just schema validation.
/// </summary>
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="path">Specifies a file from which to load. It must be non-empty and point to an existing file.</param>
/// <param name="options">Defines the settings of the load operation.</param>
/// <returns>The data view produced by loading <paramref name="path"/>.</returns>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[LoadFromTextFile](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs)]
/// ]]>
/// </format>
/// </example>
public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path,
    TextLoader.Options options = null)
{
    // Validate the path up front so a missing file is reported against the 'path' parameter.
    Contracts.CheckNonEmpty(path, nameof(path));
    if (!File.Exists(path))
    {
        throw Contracts.ExceptParam(nameof(path), "File does not exist at path: {0}", path);
    }

    var environment = catalog.GetEnvironment();
    var fileSource = new MultiFileSource(path);

    // The same file serves both as the data sample for the loader and as the data to load.
    var loader = new TextLoader(environment, options, dataSample: fileSource);
    return loader.Load(fileSource);
}

/// <summary>
/// Load a <see cref="IDataView"/> from a text file using <see cref="TextLoader"/>.
/// Note that <see cref="IDataView"/>'s are lazy, so no actual loading happens here, just schema validation.
Expand Down Expand Up @@ -191,27 +233,20 @@ public static IDataView LoadFromTextFile<TInput>(this DataOperationsCatalog cata
/// </summary>
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="path">Specifies a file from which to load.</param>
/// <param name="options">Defines the settings of the load operation.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[LoadFromTextFile](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs)]
/// ]]>
/// </format>
/// </example>
public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path,
TextLoader.Options options = null)
/// <param name="options">Defines the settings of the load operation. No need to specify a Columns field,
/// as columns will be inferred by this method.</param>
/// <returns>The data view.</returns>
public static IDataView LoadFromTextFile<TInput>(this DataOperationsCatalog catalog, string path,
TextLoader.Options options)
{
Contracts.CheckNonEmpty(path, nameof(path));
if (!File.Exists(path))
{
throw Contracts.ExceptParam(nameof(path), "File does not exist at path: {0}", path);
}

var env = catalog.GetEnvironment();
var source = new MultiFileSource(path);

return new TextLoader(env, options, dataSample: source).Load(source);
return TextLoader.CreateTextLoader<TInput>(CatalogUtils.GetEnvironment(catalog), options)
.Load(new MultiFileSource(path));
}

/// <summary>
Expand Down
9 changes: 7 additions & 2 deletions test/Microsoft.ML.Functional.Tests/Prediction.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,14 @@ public void ReconfigurablePrediction()
{
var mlContext = new MLContext(seed: 1);

var options = new TextLoader.Options
{
HasHeader = TestDatasets.Sentiment.fileHasHeader,
Separators = new[] { TestDatasets.Sentiment.fileSeparator }
};

var data = mlContext.Data.LoadFromTextFile<TweetSentiment>(TestCommon.GetDataPath(DataDir, TestDatasets.Sentiment.trainFilename),
hasHeader: TestDatasets.Sentiment.fileHasHeader,
separatorChar: TestDatasets.Sentiment.fileSeparator);
options);

// Create a training pipeline.
var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText")
Expand Down
27 changes: 22 additions & 5 deletions test/Microsoft.ML.Tests/TextLoaderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -704,8 +704,10 @@ public class IrisColumnIndices
public string Type;
}

[Fact]
public void LoaderColumnsFromIrisData()
[Theory]
[InlineData(true)]
[InlineData(false)]
public void LoaderColumnsFromIrisData(bool useOptionsObject)
{
var dataPath = GetDataPath(TestDatasets.irisData.trainFilename);
var mlContext = new MLContext(1);
Expand All @@ -719,7 +721,12 @@ public void LoaderColumnsFromIrisData()
var irisFirstRowValues = irisFirstRow.Values.GetEnumerator();

// Simple load
var dataIris = mlContext.Data.CreateTextLoader<Iris>(separatorChar: ',').Load(dataPath);
IDataView dataIris;
if (useOptionsObject)
dataIris = mlContext.Data.CreateTextLoader<Iris>(new TextLoader.Options() { Separator = ",", AllowQuoting = false }).Load(dataPath);
else
dataIris = mlContext.Data.CreateTextLoader<Iris>(separatorChar: ',').Load(dataPath);

var previewIris = dataIris.Preview(1);

Assert.Equal(5, previewIris.ColumnView.Length);
Expand All @@ -735,7 +742,12 @@ public void LoaderColumnsFromIrisData()
Assert.Equal("Iris-setosa", previewIris.RowView[0].Values[index].Value.ToString());

// Load with start and end indexes
var dataIrisStartEnd = mlContext.Data.CreateTextLoader<IrisStartEnd>(separatorChar: ',').Load(dataPath);
IDataView dataIrisStartEnd;
if (useOptionsObject)
dataIrisStartEnd = mlContext.Data.CreateTextLoader<IrisStartEnd>(new TextLoader.Options() { Separator = ",", AllowQuoting = false }).Load(dataPath);
else
dataIrisStartEnd = mlContext.Data.CreateTextLoader<IrisStartEnd>(separatorChar: ',').Load(dataPath);

var previewIrisStartEnd = dataIrisStartEnd.Preview(1);

Assert.Equal(2, previewIrisStartEnd.ColumnView.Length);
Expand All @@ -752,7 +764,12 @@ public void LoaderColumnsFromIrisData()
}

// load setting the distinct columns. Loading column 0 and 2
var dataIrisColumnIndices = mlContext.Data.CreateTextLoader<IrisColumnIndices>(separatorChar: ',').Load(dataPath);
IDataView dataIrisColumnIndices;
if (useOptionsObject)
dataIrisColumnIndices = mlContext.Data.CreateTextLoader<IrisColumnIndices>(new TextLoader.Options() { Separator = ",", AllowQuoting = false }).Load(dataPath);
else
dataIrisColumnIndices = mlContext.Data.CreateTextLoader<IrisColumnIndices>(separatorChar: ',').Load(dataPath);

var previewIrisColumnIndices = dataIrisColumnIndices.Preview(1);

Assert.Equal(2, previewIrisColumnIndices.ColumnView.Length);
Expand Down