-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Add V1 Scenario tests for data transformation #2803
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b4f9a6e
bc6955a
2c1af9e
bc96109
3e68c5d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System; | ||
using Microsoft.ML.Functional.Tests.Datasets; | ||
using Microsoft.ML.RunTests; | ||
using Microsoft.ML.TestFramework; | ||
using Microsoft.ML.Trainers; | ||
using Microsoft.ML.Transforms; | ||
using Microsoft.ML.Transforms.Text; | ||
using Xunit; | ||
using Xunit.Abstractions; | ||
|
||
namespace Microsoft.ML.Functional.Tests | ||
{ | ||
public class DataTransformation : BaseTestClass | ||
{ | ||
/// <summary>
/// Creates the test class, forwarding the xUnit output helper to the base test class.
/// </summary>
/// <param name="output">The output helper handed to the base class constructor.</param>
public DataTransformation(ITestOutputHelper output)
    : base(output)
{
}
|
||
/// <summary>
/// Extensibility: Add a new column that is a function of other columns.
/// </summary>
/// <remarks>
/// A custom mapping fills <c>Float1</c> with the cosine of the angle between the
/// (width, length) vectors of the petal and the sepal. The test then recomputes that
/// value from each transformed row and checks that it matches the stored column.
/// </remarks>
[Fact]
void ExtensibilityAddAColumnAsAFunctionOfMultipleColumns()
{
    // Concurrency must be 1 to assure that the mapping is done sequentially.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Load the Iris dataset.
    var data = mlContext.Data.LoadFromTextFile<Iris>(
        GetDataPath(TestDatasets.iris.trainFilename),
        hasHeader: TestDatasets.iris.fileHasHeader,
        separatorChar: TestDatasets.iris.fileSeparator);

    // Subsample it down to the first 10 rows.
    int numSamples = 10;
    data = mlContext.Data.TakeRows(data, numSamples);

    // Create a stand-alone function that computes the cosine between the petal
    // vector (width, length) and the sepal vector (width, length).
    float angiospermCosine(float petalWidth, float petalLength, float sepalWidth, float sepalLength)
    {
        var petalMagnitude = Math.Sqrt(petalWidth * petalWidth + petalLength * petalLength);
        var sepalMagnitude = Math.Sqrt(sepalWidth * sepalWidth + sepalLength * sepalLength);
        return (float)((petalWidth * sepalWidth + petalLength * sepalLength) / (petalMagnitude * sepalMagnitude));
    }

    // Create a function that copies the input columns and generates the extra column.
    // Arguments are passed in the order the parameters are declared (width, then length).
    Action<Iris, IrisWithOneExtraColumn> addCosineColumn = (input, output) =>
    {
        output.Label = input.Label;
        output.Float1 = angiospermCosine(input.PetalWidth, input.PetalLength, input.SepalWidth, input.SepalLength);
        output.PetalLength = input.PetalLength;
        output.PetalWidth = input.PetalWidth;
        output.SepalLength = input.SepalLength;
        output.SepalWidth = input.SepalWidth;
    };

    // Create a pipeline to execute the custom function.
    var pipeline = mlContext.Transforms.CustomMapping(addCosineColumn, null);

    // Transform the data.
    var transformedData = pipeline.Fit(data).Transform(data);

    // Verify that the column has the correct data.
    // The row object is reused, so each row is checked before moving to the next one.
    var transformedRows = mlContext.Data.CreateEnumerable<IrisWithOneExtraColumn>(transformedData, reuseRowObject: true);
    foreach (var row in transformedRows)
    {
        var expectedCosine = angiospermCosine(row.PetalWidth, row.PetalLength, row.SepalWidth, row.SepalLength);
        Assert.Equal(expectedCosine, row.Float1);
    }
}
|
||
/// <summary>
/// Extensibility: Add multiple new columns.
/// </summary>
/// <remarks>
/// A custom mapping copies the Iris columns and fills <c>Float1</c> and <c>Float2</c>
/// with values from <c>GetRandomNumber</c>; the test recomputes both values from
/// each transformed row and compares them with the stored columns.
/// </remarks>
[Fact]
void ExtensibilityAddingTwoColumns()
{
    // A concurrency of 1 keeps the custom mapping sequential.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Read the Iris training file.
    var irisData = mlContext.Data.LoadFromTextFile<Iris>(
        GetDataPath(TestDatasets.iris.trainFilename),
        hasHeader: TestDatasets.iris.fileHasHeader,
        separatorChar: TestDatasets.iris.fileSeparator);

    // Keep only the first 10 rows.
    int rowCount = 10;
    irisData = mlContext.Data.TakeRows(irisData, rowCount);

    // Mapping that copies the original columns and derives the two extra ones.
    Action<Iris, IrisWithTwoExtraColumns> addTwoColumns = (source, destination) =>
    {
        destination.Label = source.Label;
        destination.Float1 = GetRandomNumber(1 + source.Label + source.PetalLength + source.PetalWidth + source.SepalLength + source.SepalWidth);
        destination.Float2 = GetRandomNumber(2 + source.Label + source.PetalLength + source.PetalWidth + source.SepalLength + source.SepalWidth);
        destination.PetalLength = source.PetalLength;
        destination.PetalWidth = source.PetalWidth;
        destination.SepalLength = source.SepalLength;
        destination.SepalWidth = source.SepalWidth;
    };

    // Wrap the mapping in an estimator, fit it, and apply it to the same data.
    var estimator = mlContext.Transforms.CustomMapping(addTwoColumns, null);
    var transformer = estimator.Fit(irisData);
    var mappedData = transformer.Transform(irisData);

    // Recompute both values per row and compare them with what the mapping stored.
    var mappedRows = mlContext.Data.CreateEnumerable<IrisWithTwoExtraColumns>(mappedData, reuseRowObject: true);
    foreach (var mappedRow in mappedRows)
    {
        var expectedFloat1 = GetRandomNumber(1 + mappedRow.Label + mappedRow.PetalLength + mappedRow.PetalWidth + mappedRow.SepalLength + mappedRow.SepalWidth);
        var expectedFloat2 = GetRandomNumber(2 + mappedRow.Label + mappedRow.PetalLength + mappedRow.PetalWidth + mappedRow.SepalLength + mappedRow.SepalWidth);
        Assert.Equal(expectedFloat1, mappedRow.Float1);
        Assert.Equal(expectedFloat2, mappedRow.Float2);
    }
}
|
||
/// <summary>
/// Extensibility: Featurize text using custom word-grams, char-grams, and normalization.
/// </summary>
/// <remarks>
/// Trains a binary classifier on text features built with both the char and word
/// extractors enabled and L1 vector normalization, then checks the evaluation metrics.
/// </remarks>
[Fact]
void ExtensibilityModifyTextFeaturization()
{
    // Fixed seed and a concurrency of 1.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Load the sentiment training data.
    var sentimentData = mlContext.Data.LoadFromTextFile<TweetSentiment>(
        GetDataPath(TestDatasets.Sentiment.trainFilename),
        hasHeader: TestDatasets.Sentiment.fileHasHeader,
        separatorChar: TestDatasets.Sentiment.fileSeparator);

    // Featurization step.
    // TODO #2802: Update FeaturizeText to allow specifications of word-grams and char-grams.
    var featurizerOptions = new TextFeaturizingEstimator.Options
    {
        UseCharExtractor = true,
        UseWordExtractor = true,
        VectorNormalizer = TextFeaturizingEstimator.TextNormKind.L1
    };
    var featurizer = mlContext.Transforms.Text.FeaturizeText("Features", new string[] { "SentimentText" }, featurizerOptions);

    // Cache the featurized data ahead of the trainer.
    var cachedFeaturizer = featurizer.AppendCacheCheckpoint(mlContext);

    // SDCA binary trainer configured with NumThreads = 1.
    var trainerOptions = new SdcaBinaryTrainer.Options { NumThreads = 1 };
    var trainer = mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent(trainerOptions);

    // Full training pipeline.
    var pipeline = cachedFeaturizer.Append(trainer);

    // Train, score the training data, and evaluate.
    var model = pipeline.Fit(sentimentData);
    var scoredData = model.Transform(sentimentData);
    var metrics = mlContext.BinaryClassification.Evaluate(scoredData);

    // The returned metrics must be valid.
    Common.AssertMetrics(metrics);
}
|
||
/// <summary>
/// Extensibility: Apply a normalizer to columns in the dataset.
/// </summary>
/// <remarks>
/// Concatenates the Iris feature columns into "Features", applies MinMax normalization,
/// and checks that every slot of every row lies in the range [-1, 1].
/// </remarks>
[Fact]
void ExtensibilityNormalizeColumns()
{
    // Fixed seed and a concurrency of 1.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Read the Iris training file.
    var irisData = mlContext.Data.LoadFromTextFile<Iris>(
        GetDataPath(TestDatasets.iris.trainFilename),
        hasHeader: TestDatasets.iris.fileHasHeader,
        separatorChar: TestDatasets.iris.fileSeparator);

    // Build the transformation: concatenate the features, then normalize them.
    var concatenate = mlContext.Transforms.Concatenate("Features", Iris.Features);
    var pipeline = concatenate.Append(
        mlContext.Transforms.Normalize("Features", mode: NormalizingEstimator.NormalizerMode.MinMax));

    // Fit on the data and apply the result to the same data.
    var normalizedData = pipeline.Fit(irisData).Transform(irisData);

    // Validate that the data was normalized to between -1 and 1, slot by slot.
    var normalizedRows = mlContext.Data.CreateEnumerable<FeatureColumn>(normalizedData, reuseRowObject: true);
    foreach (var normalizedRow in normalizedRows)
    {
        foreach (var slotValue in normalizedRow.Features)
        {
            Assert.InRange(slotValue, -1, 1);
        }
    }
}
|
||
/// <summary>
/// Returns a repeatable pseudo-random value derived from <paramref name="number"/>.
/// </summary>
/// <param name="number">Value that is multiplied by 10 and truncated to an integer seed.</param>
/// <returns>
/// The first <see cref="Random.NextDouble"/> sample of a generator created with that seed,
/// converted to <see cref="float"/>.
/// </returns>
private float GetRandomNumber(float number)
{
    // A fresh generator per call means equal inputs always give the same output.
    var generator = new Random((int)(10 * number));
    var sample = generator.NextDouble();
    return (float)sample;
}
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
namespace Microsoft.ML.Functional.Tests.Datasets | ||
{ | ||
/// <summary>
/// Row type that holds a single feature column.
/// </summary>
internal sealed class FeatureColumn
{
    /// <summary>
    /// The feature values of the row.
    /// </summary>
    public float[] Features { get; set; }
}
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe move it to the other file, the Iris. #ByDesign There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is designed to be generic for tests against any features columns, not just the iris, so I'll keep it in its own file for now, similar to `FeatureContributionOutput`. In reply to: 261830507 [](ancestors = 261830507) |
||
} |
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you change summary tag to match the test? #Resolved