Skip to content

Auto.ML: Fix issue when parsing float string fails on pl-PL culture set using Regression Experiment #5163

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Oct 30, 2020
Merged
2 changes: 1 addition & 1 deletion src/Microsoft.ML.AutoML/Sweepers/Parameters.cs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ public LongParameterValue(string name, long value)
{
_name = name;
_value = value;
_valueText = _value.ToString("D");
_valueText = _value.ToString("D", CultureInfo.InvariantCulture);
}

public bool Equals(IParameterValue other)
Expand Down
11 changes: 9 additions & 2 deletions src/Microsoft.ML.AutoML/Sweepers/SweeperProbabilityUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

using System;
using System.Collections.Generic;
using System.Globalization;
using Microsoft.ML.Internal.CpuMath;

namespace Microsoft.ML.AutoML
Expand Down Expand Up @@ -98,13 +99,15 @@ public static float[] ParameterSetAsFloatArray(IValueGenerator[] sweepParams, Pa
}
else if (sweepParam is LongValueGenerator lvg)
{
var longValue = GetIfIParameterValueOfT<long>(pset) ?? long.Parse(pset.ValueText, CultureInfo.InvariantCulture);
// Normalizing all numeric parameters to [0,1] range.
result.Add(lvg.NormalizeValue(new LongParameterValue(pset.Name, long.Parse(pset.ValueText))));
result.Add(lvg.NormalizeValue(new LongParameterValue(pset.Name, longValue)));
}
else if (sweepParam is FloatValueGenerator fvg)
{
var floatValue = GetIfIParameterValueOfT<float>(pset) ?? float.Parse(pset.ValueText, CultureInfo.InvariantCulture);
// Normalizing all numeric parameters to [0,1] range.
result.Add(fvg.NormalizeValue(new FloatParameterValue(pset.Name, float.Parse(pset.ValueText))));
result.Add(fvg.NormalizeValue(new FloatParameterValue(pset.Name, floatValue)));
}
else
{
Expand All @@ -115,6 +118,10 @@ public static float[] ParameterSetAsFloatArray(IValueGenerator[] sweepParams, Pa
return result.ToArray();
}

/// <summary>
/// Returns the typed value of <paramref name="parameterValue"/> when it implements
/// <see cref="IParameterValue{T}"/>; otherwise returns <c>null</c>.
/// </summary>
/// <typeparam name="T">The value type to look for.</typeparam>
/// <param name="parameterValue">The parameter value to inspect.</param>
/// <returns>The typed value, or <c>null</c> when the type test fails.</returns>
private static T? GetIfIParameterValueOfT<T>(IParameterValue parameterValue)
    where T : struct
{
    // A successful type test lets callers skip re-parsing the value's text form.
    if (parameterValue is IParameterValue<T> typed)
    {
        return typed.Value;
    }

    return null;
}

public static ParameterSet FloatArrayAsParameterSet(IValueGenerator[] sweepParams, float[] array, bool expandedCategoricals = true)
{
Runtime.Contracts.Assert(array.Length == sweepParams.Length);
Expand Down
66 changes: 51 additions & 15 deletions test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Globalization;
using System.Linq;
using System.Threading;
using Microsoft.ML.Data;
using Microsoft.ML.TestFramework;
using Microsoft.ML.TestFramework.Attributes;
Expand Down Expand Up @@ -102,22 +105,55 @@ private void Context_Log(object sender, LoggingEventArgs e)
//throw new NotImplementedException();
}

/// <summary>
/// Runs an AutoML regression experiment on the generated regression dataset and
/// asserts on the RSquared of the resulting runs. The parameterized form sets the
/// thread's current culture to <paramref name="culture"/> for the duration of the
/// test and restores the previous culture in a finally block.
/// </summary>
// NOTE(review): this listing appears to interleave the pre-change version of the test
// ([Fact], no parameter) with the post-change version ([Theory], culture parameter);
// confirm against the real file before editing.
[Fact]
public void AutoFitRegressionTest()
[Theory]
[InlineData("en-US")]
[InlineData("ar-SA")]
[InlineData("pl-PL")]
public void AutoFitRegressionTest(string culture)
{
var context = new MLContext(1);
var dataPath = DatasetUtil.GetMlNetGeneratedRegressionDataset();
var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.MlNetGeneratedRegressionLabel);
var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
var trainData = textLoader.Load(dataPath);
var validationData = context.Data.TakeRows(trainData, 20);
trainData = context.Data.SkipRows(trainData, 20);
var result = context.Auto()
.CreateRegressionExperiment(0)
.Execute(trainData, validationData,
new ColumnInformation() { LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel });
// Remember the culture in effect so the finally block can put it back.
var originalCulture = Thread.CurrentThread.CurrentCulture;
try
{
Thread.CurrentThread.CurrentCulture = new CultureInfo(culture);

// If users run AutoML with a different locale, sometimes
// the sweeper encounters problems when parsing some strings.
// So testing in another culture is necessary.
// Furthermore, these issues might only occur after ~70
// iterations, so more experiment time is needed for this to
// occur.
uint experimentTime = (uint) (culture == "en-US" ? 0 : 180);

var experimentSettings = new RegressionExperimentSettings { MaxExperimentTimeInSeconds = experimentTime};
if (!Environment.Is64BitProcess)
{
// LightGBM isn't available on x86 machines
experimentSettings.Trainers.Remove(RegressionTrainer.LightGbm);
}

var context = new MLContext(1);
var dataPath = DatasetUtil.GetMlNetGeneratedRegressionDataset();
var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.MlNetGeneratedRegressionLabel);
var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
var trainData = textLoader.Load(dataPath);
// The first 20 rows are used for validation; the rest are used for training.
var validationData = context.Data.TakeRows(trainData, 20);
trainData = context.Data.SkipRows(trainData, 20);
var result = context.Auto()
.CreateRegressionExperiment(experimentSettings)
.Execute(trainData, validationData,
new ColumnInformation() { LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel });

// NOTE(review): the selector yields a bool, so Max(...) is true when at least one run
// has RSquared > 0.9 (false orders before true). Any(...) would state that intent more
// directly. The same assertion is repeated further down in this listing.
Assert.True(result.RunDetails.Max(i => i.ValidationMetrics.RSquared > 0.9));

// Ensure experimentTime allows enough iterations to fully test the internationalization code
// If the below assertion fails, increase the experiment time so the number of iterations is met
Assert.True(culture == "en-US" || result.RunDetails.Count() >= 75, $"RunDetails.Count() = {result.RunDetails.Count()}, below 75");

Assert.True(result.RunDetails.Max(i => i.ValidationMetrics.RSquared > 0.9));
}
finally
{
// Restore the culture captured before the try block, whether or not the test passed.
Thread.CurrentThread.CurrentCulture = originalCulture;
}
}

[LightGBMFact]
Expand Down Expand Up @@ -351,4 +387,4 @@ private TextLoader.Options GetLoaderArgsRank(string labelColumnName, string grou
};
}
}
}
}