Skip to content

Hash Transform API that takes in advanced options. #4443

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Dec 6, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
PR feedback.
  • Loading branch information
codemzs committed Dec 6, 2019
commit 6b8a35f099d9fdea7d55d585ff2c96095270721c
3 changes: 3 additions & 0 deletions src/DefaultGenApiDocIds.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// These attributes should be excluded from reference assemblies.

T:Microsoft.ML.BestFriendAttribute
1 change: 1 addition & 0 deletions src/Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
API missing from old) -->
<RunApiCompatForSrc>true</RunApiCompatForSrc>
<RunMatchingRefApiCompat>false</RunMatchingRefApiCompat>
<ApiCompatExcludeAttributeList>$(MSBuildThisFileDirectory)DefaultGenApiDocIds.txt</ApiCompatExcludeAttributeList>
</PropertyGroup>

<ItemGroup>
Expand Down
9 changes: 5 additions & 4 deletions src/Microsoft.ML.Data/Transforms/Hashing.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1134,7 +1134,7 @@ internal static class Defaults
}

/// <summary>
/// Describes how the transformer handles one column pair.
/// Initializes a new instance of the ColumnOptions class.
/// </summary>
public sealed class ColumnOptions
{
Expand Down Expand Up @@ -1163,9 +1163,6 @@ public sealed class ColumnOptions
/// </summary>
public int MaximumNumberOfInverts { get; set; }

/// <summary>
/// Describes how the transformer handles one column pair.
/// </summary>
/// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="name"/> will be used as source.</param>
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
Expand All @@ -1189,6 +1186,10 @@ public ColumnOptions(string name,
UseOrderedHashing = useOrderedHashing;
MaximumNumberOfInverts = maximumNumberOfInverts;
}

/// <summary>
/// Initializes a new instance of the <see cref="ColumnOptions"/> class without assigning any property
/// in the constructor body, so that callers can configure the instance with an object initializer.
/// </summary>
/// <remarks>
/// NOTE(review): the values the properties hold after this constructor runs depend on the property
/// declarations, which are not fully visible here — confirm they match the defaults used by the
/// parameterized constructor (e.g. the number of bits must be between 1 and 31).
/// </remarks>
public ColumnOptions()
{
}
}

/// <summary>
Expand Down
30 changes: 30 additions & 0 deletions test/Microsoft.ML.Functional.Tests/DataTransformation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
using Microsoft.ML.Transforms.Text;
using Xunit;
using Xunit.Abstractions;
using static Microsoft.ML.Transforms.HashingEstimator;

namespace Microsoft.ML.Functional.Tests
{
Expand Down Expand Up @@ -185,6 +186,35 @@ void ExtensibilityNormalizeColumns()
Assert.InRange(row.Features[i], -1, 1);
}

/// <summary>
/// Hashes the concatenated Iris feature vector using the <see cref="ColumnOptions"/> overload of the
/// hash transform and checks that every resulting slot lies within the expected range.
/// </summary>
[Fact]
void HashColumns()
{
    // Inclusive bounds checked for every hashed slot; 1u << 31 is 2^31.
    const uint lowestAllowed = 0u;
    const uint highestAllowed = 1u << 31;

    // Create the context with a fixed seed.
    var mlContext = new MLContext(seed: 1);

    // Load the Iris training file.
    var irisPath = TestCommon.GetDataPath(DataDir, TestDatasets.iris.trainFilename);
    var data = mlContext.Data.LoadFromTextFile<Iris>(
        irisPath,
        hasHeader: TestDatasets.iris.fileHasHeader,
        separatorChar: TestDatasets.iris.fileSeparator);

    // Build the pipeline: concatenate the feature columns, then hash "Features" in place
    // with 31 bits and ordered hashing, configured through an object initializer.
    var concatenateFeatures = mlContext.Transforms.Concatenate("Features", Iris.Features);
    var hashOptions = new ColumnOptions
    {
        Name = "Features",
        InputColumnName = "Features",
        NumberOfBits = 31,
        UseOrderedHashing = true
    };
    var hashFeatures = mlContext.Transforms.Conversion.Hash(new[] { hashOptions });
    var pipeline = concatenateFeatures.Append(hashFeatures);

    // Fit on the data and transform that same data.
    var transformedData = pipeline.Fit(data).Transform(data);

    // Read the rows back and verify that each hashed slot is within the inclusive bounds.
    var hashedRows = mlContext.Data.CreateEnumerable<HashedFeatureColumn>(transformedData, true);
    foreach (var hashedRow in hashedRows)
    {
        foreach (uint slot in hashedRow.Features)
        {
            Assert.InRange(slot, lowestAllowed, highestAllowed);
        }
    }
}

private float GetRandomNumber(float number)
{
var seed = (int)(10 * number);
Expand Down
5 changes: 5 additions & 0 deletions test/Microsoft.ML.Functional.Tests/Datasets/CommonColumns.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ internal sealed class FeatureColumn
public float[] Features { get; set; }
}

/// <summary>
/// A class to hold a Features column whose slots are unsigned integers,
/// e.g. for reading back rows whose Features column has been hashed.
/// </summary>
internal sealed class HashedFeatureColumn
{
/// <summary>
/// The feature slots of one row, as unsigned integers.
/// </summary>
public uint[] Features { get; set; }
}

/// <summary>
/// A class to hold the output of FeatureContributionCalculator
/// </summary>
Expand Down