Skip to content

Commit f67aab5

Browse files
authored
Add FixZero for LogMeanVariance normalizer (dotnet#3916)
1 parent 9a32f54 commit f67aab5

File tree

6 files changed

+265
-27
lines changed

6 files changed

+265
-27
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeLogMeanVariance.cs

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,15 @@ public class NormalizeLogMeanVariance
1212
{
1313
public static void Example()
1414
{
15-
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
15+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
1616
// as well as the source of randomness.
1717
var mlContext = new MLContext();
1818
var samples = new List<DataPoint>()
1919
{
20-
new DataPoint(){ Features = new float[4] { 1, 1, 3, 0} },
21-
new DataPoint(){ Features = new float[4] { 2, 2, 2, 0} },
22-
new DataPoint(){ Features = new float[4] { 0, 0, 1, 0} },
23-
new DataPoint(){ Features = new float[4] {-1,-1,-1, 1} }
20+
new DataPoint(){ Features = new float[5] { 1, 1, 3, 0, float.MaxValue } },
21+
new DataPoint(){ Features = new float[5] { 2, 2, 2, 0, float.MinValue } },
22+
new DataPoint(){ Features = new float[5] { 0, 0, 1, 0, 0} },
23+
new DataPoint(){ Features = new float[5] {-1,-1,-1, 1, 1} }
2424
};
2525
// Convert training data to IDataView, the general data type used in ML.NET.
2626
var data = mlContext.Data.LoadFromEnumerable(samples);
@@ -41,19 +41,19 @@ public static void Example()
4141
foreach (var row in column)
4242
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
4343
// Expected output:
44-
// 0.1587, 0.1587, 0.8654, 0.0000
45-
// 0.8413, 0.8413, 0.5837, 0.0000
46-
// 0.0000, 0.0000, 0.0940, 0.0000
47-
// 0.0000, 0.0000, 0.0000, 0.0000
44+
// 0.1587, 0.1587, 0.8654, 0.0000, 0.8413
45+
// 0.8413, 0.8413, 0.5837, 0.0000, 0.0000
46+
// 0.0000, 0.0000, 0.0940, 0.0000, 0.0000
47+
// 0.0000, 0.0000, 0.0000, 0.0000, 0.1587
4848

4949
var columnFixZero = noCdfData.GetColumn<float[]>("Features").ToArray();
5050
foreach (var row in columnFixZero)
5151
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
5252
// Expected output:
53-
// 1.8854, 1.8854, 5.2970, 0.0000
54-
// 4.7708, 4.7708, 3.0925, 0.0000
55-
// -1.0000,-1.0000, 0.8879, 0.0000
56-
// 3.8854,-3.8854,-3.5213, 0.0000
53+
// 1.8854, 1.8854, 5.2970, 0.0000, 7670682000000000000000000000000000000.0000
54+
// 4.7708, 4.7708, 3.0925, 0.0000, -7670682000000000000000000000000000000.0000
55+
// -1.0000,-1.0000, 0.8879, 0.0000, -1.0000
56+
// -3.8854,-3.8854,-3.5213, 0.0000, -0.9775
5757

5858
// Let's get transformation parameters. Since we work with only one column we need to pass 0 as parameter for GetNormalizerModelParameters.
5959
// If we have multiple column transformations, we need to pass the index of the corresponding InputOutputColumnPair.
@@ -64,18 +64,18 @@ public static void Example()
6464
// ERF is https://en.wikipedia.org/wiki/Error_function.
6565
// Expected output:
6666
// The 1-index value in resulting array would be produce by:
67-
// y = 0.5* (1 + ERF((Math.Log(x)- 0.3465736) / (0.3465736 * sqrt(2)))
67+
// y = 0.5 * (1 + ERF((Math.Log(x) - 0.3465736) / (0.3465736 * sqrt(2)))
6868
var noCdfParams = normalizeNoCdfTransform.GetNormalizerModelParameters(0) as AffineNormalizerModelParameters<ImmutableArray<float>>;
6969
var offset = noCdfParams.Offset.Length == 0 ? 0 : noCdfParams.Offset[1];
7070
var scale = noCdfParams.Scale[1];
7171
Console.WriteLine($"The 1-index value in resulting array would be produce by: y = (x - ({offset})) * {scale}");
7272
// Expected output:
73-
// The 1-index value in resulting array would be produce by: y = (x - (2.88539)) * 0.3465736
73+
// The 1-index value in resulting array would be produce by: y = (x - (0.3465736)) * 2.88539
7474
}
7575

7676
private class DataPoint
7777
{
78-
[VectorType(4)]
78+
[VectorType(5)]
7979
public float[] Features { get; set; }
8080
}
8181
}
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Collections.Immutable;
4+
using System.Linq;
5+
using Microsoft.ML;
6+
using Microsoft.ML.Data;
7+
using static Microsoft.ML.Transforms.NormalizingTransformer;
8+
9+
namespace Samples.Dynamic
10+
{
11+
public class NormalizeLogMeanVarianceFixZero
12+
{
13+
public static void Example()
14+
{
15+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
16+
// as well as the source of randomness.
17+
var mlContext = new MLContext();
18+
var samples = new List<DataPoint>()
19+
{
20+
new DataPoint(){ Features = new float[5] { 1, 1, 3, 0, float.MaxValue } },
21+
new DataPoint(){ Features = new float[5] { 2, 2, 2, 0, float.MinValue } },
22+
new DataPoint(){ Features = new float[5] { 0, 0, 1, 0, 0} },
23+
new DataPoint(){ Features = new float[5] {-1,-1,-1, 1, 1} }
24+
};
25+
// Convert training data to IDataView, the general data type used in ML.NET.
26+
var data = mlContext.Data.LoadFromEnumerable(samples);
27+
// NormalizeLogMeanVariance normalizes the data based on the computed mean and variance of the logarithm of the data.
28+
// Uses Cumulative distribution function as output.
29+
var normalize = mlContext.Transforms.NormalizeLogMeanVariance("Features", true, useCdf: true);
30+
31+
// NormalizeLogMeanVariance normalizes the data based on the computed mean and variance of the logarithm of the data.
32+
var normalizeNoCdf = mlContext.Transforms.NormalizeLogMeanVariance("Features", true, useCdf: false);
33+
34+
// Now we can transform the data and look at the output to confirm the behavior of the estimator.
35+
// This operation doesn't actually evaluate data until we read the data below.
36+
var normalizeTransform = normalize.Fit(data);
37+
var transformedData = normalizeTransform.Transform(data);
38+
var normalizeNoCdfTransform = normalizeNoCdf.Fit(data);
39+
var noCdfData = normalizeNoCdfTransform.Transform(data);
40+
var column = transformedData.GetColumn<float[]>("Features").ToArray();
41+
foreach (var row in column)
42+
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
43+
// Expected output:
44+
// 0.1587, 0.1587, 0.8654, 0.0000, 0.8413
45+
// 0.8413, 0.8413, 0.5837, 0.0000, 0.0000
46+
// 0.0000, 0.0000, 0.0940, 0.0000, 0.0000
47+
// 0.0000, 0.0000, 0.0000, 0.0000, 0.1587
48+
49+
var columnFixZero = noCdfData.GetColumn<float[]>("Features").ToArray();
50+
foreach (var row in columnFixZero)
51+
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
52+
// Expected output:
53+
// 2.0403, 2.0403, 4.0001, 0.0000, 5423991000000000000000000000000000000.0000
54+
// 4.0806, 4.0806, 2.6667, 0.0000,-5423991000000000000000000000000000000.0000
55+
// 0.0000, 0.0000, 1.3334, 0.0000, 0.0000
56+
// -2.0403,-2.0403,-1.3334, 0.0000, 0.0159
57+
58+
// Let's get transformation parameters. Since we work with only one column we need to pass 0 as parameter for GetNormalizerModelParameters.
59+
// If we have multiple column transformations, we need to pass the index of the corresponding InputOutputColumnPair.
60+
var transformParams = normalizeTransform.GetNormalizerModelParameters(0) as CdfNormalizerModelParameters<ImmutableArray<float>>;
61+
Console.WriteLine("The values in the column with index 1 in the resulting array would be produced by:");
62+
Console.WriteLine($"y = 0.5* (1 + ERF((Math.Log(x)- {transformParams.Mean[1]}) / ({transformParams.StandardDeviation[1]} * sqrt(2)))");
63+
64+
// ERF is https://en.wikipedia.org/wiki/Error_function.
65+
// Expected output:
66+
// The values in the column with index 1 in the resulting array would be produced by:
67+
// y = 0.5* (1 + ERF((Math.Log(x)- 0.3465736) / (0.3465736 * sqrt(2)))
68+
var noCdfParams = normalizeNoCdfTransform.GetNormalizerModelParameters(0) as AffineNormalizerModelParameters<ImmutableArray<float>>;
69+
var offset = noCdfParams.Offset.Length == 0 ? 0 : noCdfParams.Offset[1];
70+
var scale = noCdfParams.Scale[1];
71+
Console.WriteLine($"The values in the column with index 1 in the resulting array would be produced by: y = (x - ({offset})) * {scale}");
72+
// Expected output:
73+
// The values in the column with index 1 in the resulting array would be produced by: y = (x - (0)) * 2.040279
74+
}
75+
76+
private class DataPoint
77+
{
78+
[VectorType(5)]
79+
public float[] Features { get; set; }
80+
}
81+
}
82+
}

src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1557,7 +1557,7 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVariance
15571557
{
15581558
var lim = column.MaximumExampleCount;
15591559
host.CheckUserArg(lim > 1, nameof(column.MaximumExampleCount), "Must be greater than 1");
1560-
return new MeanVarOneColumnFunctionBuilder(host, lim, false, getter, true, column.UseCdf);
1560+
return new MeanVarOneColumnFunctionBuilder(host, lim, column.EnsureZeroUntouched, getter, true, column.UseCdf);
15611561
}
15621562

15631563
protected override bool ProcessValue(in TFloat origVal)
@@ -1633,7 +1633,7 @@ public static IColumnFunctionBuilder Create(NormalizingEstimator.LogMeanVariance
16331633
var lim = column.MaximumExampleCount;
16341634
host.CheckUserArg(lim > 1, nameof(column.MaximumExampleCount), "Must be greater than 1");
16351635
var cv = srcType.Size;
1636-
return new MeanVarVecColumnFunctionBuilder(host, cv, lim, false, getter, true, column.UseCdf);
1636+
return new MeanVarVecColumnFunctionBuilder(host, cv, lim, column.EnsureZeroUntouched, getter, true, column.UseCdf);
16371637
}
16381638

16391639
protected override bool ProcessValue(in VBuffer<TFloat> buffer)

src/Microsoft.ML.Data/Transforms/Normalizer.cs

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ namespace Microsoft.ML.Transforms
4242
/// The resulting NormalizingEstimator will normalize the data in one of the following ways based upon how it was created:
4343
/// * Min Max - A linear rescale that is based upon the minimum and maximum values for each row.
4444
/// * Mean Variance - Rescale each row to unit variance and, optionally, zero mean.
45-
/// * Log Mean Variance - Rescale each row to unit variance based on a log scale.
45+
/// * Log Mean Variance - Rescale each row to unit variance and, optionally, zero mean, based on computations in log scale.
4646
/// * Binning - Bucketizes the data in each row and performs a linear rescale based on the calculated bins.
4747
/// * Supervised Binning - Bucketizes the data in each row and performs a linear rescale based on the calculated bins. The bin calculation is based on correlation of the Label column.
4848
///
@@ -54,6 +54,13 @@ namespace Microsoft.ML.Transforms
5454
/// With Min Max, the distribution depends on how far away the number is from 0, resulting in the number with the largest distance being mapped to 1 if it's a positive number
5555
/// or -1 if it's a negative number. The distance from 0 will affect the distribution with a majority of numbers that are closer together normalizing towards 0.
5656
///
57+
/// The equation for the output $y$ of applying both Mean Variance and Log Mean Variance on input $x$ without
58+
/// using the CDF option is: $y = (x - \text{offset}) \cdot \text{scale}$, where offset and scale are computed during training.
59+
///
60+
/// Using the CDF option it is: $y = 0.5 * (1 + \text{ERF}((x - \text{mean}) / (\text{standard deviation} * \sqrt{2})))$.
61+
/// Where ERF is the [Error Function](https://en.wikipedia.org/wiki/Error_function) used to approximate the CDF of a random variable assumed to be
62+
/// normally distributed. The mean and standard deviation are computed during training.
63+
///
5764
/// To create this estimator use one of the following:
5865
/// * [NormalizeMinMax](xref:Microsoft.ML.NormalizationCatalog.NormalizeMinMax(Microsoft.ML.TransformsCatalog, System.String, System.String, System.Int64, System.Boolean))
5966
/// * [NormalizeMeanVariance](xref:Microsoft.ML.NormalizationCatalog.NormalizeMeanVariance(Microsoft.ML.TransformsCatalog,System.String,System.String,System.Int64,System.Boolean,System.Boolean))
@@ -183,13 +190,13 @@ internal override IColumnFunctionBuilder MakeBuilder(IHost host, int srcIndex, D
183190
}
184191

185192
[BestFriend]
186-
internal sealed class LogMeanVarianceColumnOptions : ColumnOptionsBase
193+
internal sealed class LogMeanVarianceColumnOptions : ControlZeroColumnOptionsBase
187194
{
188195
public readonly bool UseCdf;
189196

190197
public LogMeanVarianceColumnOptions(string outputColumnName, string inputColumnName = null,
191-
long maximumExampleCount = Defaults.MaximumExampleCount, bool useCdf = Defaults.LogMeanVarCdf)
192-
: base(outputColumnName, inputColumnName ?? outputColumnName, maximumExampleCount)
198+
long maximumExampleCount = Defaults.MaximumExampleCount, bool useCdf = Defaults.LogMeanVarCdf, bool fixZero = Defaults.EnsureZeroUntouched)
199+
: base(outputColumnName, inputColumnName ?? outputColumnName, maximumExampleCount, fixZero)
193200
{
194201
UseCdf = useCdf;
195202
}

src/Microsoft.ML.Transforms/NormalizerCatalog.cs

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatal
141141
long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
142142
bool useCdf = NormalizingEstimator.Defaults.LogMeanVarCdf)
143143
{
144-
var columnOptions = new NormalizingEstimator.LogMeanVarianceColumnOptions(outputColumnName, inputColumnName, maximumExampleCount, useCdf);
144+
var columnOptions = new NormalizingEstimator.LogMeanVarianceColumnOptions(outputColumnName, inputColumnName, maximumExampleCount, useCdf, false);
145145
return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
146146
}
147147

@@ -159,7 +159,54 @@ public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatal
159159
bool useCdf = NormalizingEstimator.Defaults.LogMeanVarCdf) =>
160160
new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog),
161161
columns.Select(column =>
162-
new NormalizingEstimator.LogMeanVarianceColumnOptions(column.OutputColumnName, column.InputColumnName, maximumExampleCount, useCdf)).ToArray());
162+
new NormalizingEstimator.LogMeanVarianceColumnOptions(column.OutputColumnName, column.InputColumnName, maximumExampleCount, useCdf, false)).ToArray());
163+
164+
/// <summary>
165+
/// Create a <see cref="NormalizingEstimator"/>, which normalizes based on the computed mean and variance of the logarithm of the data.
166+
/// </summary>
167+
/// <param name="catalog">The transform catalog</param>
168+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
169+
/// The data type on this column is the same as the input column.</param>
170+
/// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
171+
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
172+
/// The data type on this column should be <see cref="System.Single"/>, <see cref="System.Double"/> or a known-sized vector of those types.</param>
173+
/// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
174+
/// <param name="useCdf">Whether to use CDF as the output.</param>
175+
/// <example>
176+
/// <format type="text/markdown">
177+
/// <![CDATA[
178+
/// [!code-csharp[NormalizeLogMeanVariance](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeLogMeanVarianceFixZero.cs)]
179+
/// ]]>
180+
/// </format>
181+
/// </example>
182+
public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatalog catalog,
183+
string outputColumnName,
184+
bool fixZero,
185+
string inputColumnName = null,
186+
long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
187+
bool useCdf = NormalizingEstimator.Defaults.LogMeanVarCdf)
188+
{
189+
var columnOptions = new NormalizingEstimator.LogMeanVarianceColumnOptions(outputColumnName, inputColumnName, maximumExampleCount, useCdf, fixZero);
190+
return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
191+
}
192+
193+
/// <summary>
194+
/// Create a <see cref="NormalizingEstimator"/>, which normalizes based on the computed mean and variance of the logarithm of the data.
195+
/// </summary>
196+
/// <param name="catalog">The transform catalog</param>
197+
/// <param name="columns">The pairs of input and output columns.
198+
/// The input columns must be of data type <see cref="System.Single"/>, <see cref="System.Double"/> or a known-sized vector of those types.
199+
/// The data type for the output column will be the same as the associated input column.</param>
200+
/// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
201+
/// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
202+
/// <param name="useCdf">Whether to use CDF as the output.</param>
203+
public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatalog catalog, InputOutputColumnPair[] columns,
204+
bool fixZero,
205+
long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
206+
bool useCdf = NormalizingEstimator.Defaults.LogMeanVarCdf) =>
207+
new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog),
208+
columns.Select(column =>
209+
new NormalizingEstimator.LogMeanVarianceColumnOptions(column.OutputColumnName, column.InputColumnName, maximumExampleCount, useCdf, fixZero)).ToArray());
163210

164211
/// <summary>
165212
/// Create a <see cref="NormalizingEstimator"/>, which normalizes by assigning the data into bins with equal density.

0 commit comments

Comments
 (0)