Skip to content

IndexOutOfRange Exception in KeyToVector transformer #2681

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add tests
  • Loading branch information
Ivan Matantsev committed Feb 21, 2019
commit fcdf0593c75db1230703552b2a0073bf07d6b3b8
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/Transforms/Hashing.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1273,7 +1273,7 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
metadata.Add(slotMeta);
if (colInfo.InvertHash != 0)
metadata.Add(new SchemaShape.Column(MetadataUtils.Kinds.KeyValues, SchemaShape.Column.VectorKind.Vector, TextDataViewType.Instance, false));
result[colInfo.Name] = new SchemaShape.Column(colInfo.Name, col.ItemType is VectorType ? SchemaShape.Column.VectorKind.Vector : SchemaShape.Column.VectorKind.Scalar, NumberDataViewType.UInt32, true, new SchemaShape(metadata));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

col.ItemType is VectorType ? SchemaShape.Column.VectorKind.Vector : SchemaShape.Column.VectorKind [](start = 76, length = 97)

Geez, the original author sure got confused.

result[colInfo.Name] = new SchemaShape.Column(colInfo.Name, col.Kind, NumberDataViewType.UInt32, true, new SchemaShape(metadata));
}
Copy link
Member

@wschin wschin Mar 6, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this mean the input shape must be scalar? If so, we need to throw when encountering a vector. #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. It means we accept vectors, scalars, and variable-length vectors.


In reply to: 263166153 [](ancestors = 263166153)

return new SchemaShape(result.Values);
}
Expand Down
5 changes: 3 additions & 2 deletions src/Microsoft.ML.Data/Transforms/KeyToVector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,7 @@ private ValueGetter<VBuffer<float>> MakeGetterInd(DataViewRow input, int iinfo)
int lenDst = checked(size * lenSrc);
var values = src.GetValues();
int cntSrc = values.Length;
var editor = VBufferEditor.Create(ref dst, lenDst, cntSrc);
var editor = VBufferEditor.Create(ref dst, lenDst, valuesCount: cntSrc, requireIndicesOnDense: true);

int count = 0;
if (src.IsDense)
Expand All @@ -573,6 +573,7 @@ private ValueGetter<VBuffer<float>> MakeGetterInd(DataViewRow input, int iinfo)
if (key >= (uint)size)
continue;
editor.Values[count] = 1;
editor.Indices[count++] = slot * size + (int)key;
}
}
else
Expand Down Expand Up @@ -793,7 +794,7 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema)

var metadata = new List<SchemaShape.Column>();
if (col.Metadata.TryFindColumn(MetadataUtils.Kinds.KeyValues, out var keyMeta))
if (col.Kind != SchemaShape.Column.VectorKind.VariableVector && keyMeta.ItemType is TextDataViewType)
if (((colInfo.Bag && col.IsKey) || col.Kind != SchemaShape.Column.VectorKind.VariableVector) && keyMeta.ItemType is TextDataViewType)
metadata.Add(new SchemaShape.Column(MetadataUtils.Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, keyMeta.ItemType, false));
if (!colInfo.Bag && (col.Kind == SchemaShape.Column.VectorKind.Scalar || col.Kind == SchemaShape.Column.VectorKind.Vector))
metadata.Add(new SchemaShape.Column(MetadataUtils.Kinds.CategoricalSlotRanges, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Int32, false));
Expand Down
23 changes: 23 additions & 0 deletions test/BaselineOutput/Common/Categorical/oneHot.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=A:I4:0
#@ col=B:I4:1-2
#@ col=CatA:U4[2]:3
#@ col=CatA:R4:4-5
#@ col=CatB:U4[2]:6
#@ col=CatB:R4:7-9
#@ col=CatC:U4[2]:10
#@ col=CatC:R4:11-12
#@ col=CatD:U4[2]:13
#@ col=CatVA:U4[3]:14-15
#@ col=CatVA:R4:16-18
#@ col=CatVB:U4[3]:19-20
#@ col=CatVB:R4:21-26
#@ col=CatVC:U4[3]:27-28
#@ col=CatVC:R4:29-34
#@ col=CatVD:U4[3]:35-36
#@ }
A "" "" CatA 1 4 CatB Bit2 Bit1 Bit0 CatC 1 4 CatD "" "" 2 3 4 "" "" [0].Bit2 [0].Bit1 [0].Bit0 [1].Bit2 [1].Bit1 [1].Bit0 "" "" [0].2 [0].3 [0].4 [1].2 [1].3 [1].4 "" ""
1 2 3 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1
4 2 4 1 0 1 1 0 0 1 1 0 1 1 0 2 1 0 1 0 2 0 0 0 0 1 0 0 2 1 0 0 0 0 1 0 2
23 changes: 23 additions & 0 deletions test/BaselineOutput/Common/CategoricalHash/oneHotHash.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=A:TX:0
#@ col=B:TX:1-2
#@ col=CatA:U4[65536]:3
#@ col=CatA:R4:4-65539
#@ col=CatB:U4[65536]:65540
#@ col=CatB:R4:65541-65558
#@ col=CatC:U4[65536]:65559
#@ col=CatC:R4:65560-131095
#@ col=CatD:U4[65536]:131096
#@ col=CatVA:U4[65536]:131097-131098
#@ col=CatVA:R4:131099-196634
#@ col=CatVB:U4[65536]:196635-196636
#@ col=CatVB:R4:196637-196672
#@ col=CatVC:U4[65536]:196673-196674
#@ col=CatVC:R4:196675-327746
#@ col=CatVD:U4[65536]:327747-327748
#@ }
A 327748 2:CatA 65539:CatB 65558:CatC 131095:CatD
1 2 3 17369 327745 17369:1 65536:17369 65540:1 65545:1 65546:1 65547:1 65548:1 65550:1 65551:1 65554:1 65555:17369 82925:1 131092:17369 131093:45477 131094:61578 176572:1 192673:1 196631:45477 196632:61578 196635:1 196637:1 196638:1 196642:1 196643:1 196645:1 196648:1 196650:1 196653:1 196654:1 196655:1 196656:1 196661:1 196665:1 196667:1 196669:45477 196670:61578 242148:1 323785:1 327743:45477 327744:61578
4 4 5 20750 327745 20750:1 65536:20750 65540:1 65542:1 65546:1 65551:1 65552:1 65553:1 65555:20750 86306:1 131092:20750 131093:20750 131094:23709 151845:1 154804:1 196631:20750 196632:23709 196636:1 196638:1 196642:1 196647:1 196648:1 196649:1 196654:1 196656:1 196657:1 196658:1 196661:1 196664:1 196665:1 196666:1 196668:1 196669:20750 196670:23709 217421:1 285916:1 327743:20750 327744:23709
14 changes: 0 additions & 14 deletions test/BaselineOutput/SingleRelease/Categorical/featurized.tsv

This file was deleted.

13 changes: 0 additions & 13 deletions test/BaselineOutput/SingleRelease/CategoricalHash/featurized.tsv

This file was deleted.

23 changes: 16 additions & 7 deletions test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ public CategoricalHashTests(ITestOutputHelper output) : base(output)
private class TestClass
{
public string A;
public string B;
public string C;
[VectorType(2)]
public string[] B;
}

private class TestMeta
Expand All @@ -48,17 +48,27 @@ private class TestMeta
[Fact]
public void CategoricalHashWorkout()
{
var data = new[] { new TestClass() { A = "1", B = "2", C = "3", }, new TestClass() { A = "4", B = "5", C = "6" } };
var data = new[] { new TestClass() { A = "1", B = new[] { "2", "3" } }, new TestClass() { A = "4", B = new[] { "4", "5" } } };

var dataView = ML.Data.ReadFromEnumerable(data);
var pipe = ML.Transforms.Categorical.OneHotHashEncoding(new[]{
new OneHotHashEncodingEstimator.ColumnInfo("CatA", "A", OneHotEncodingTransformer.OutputKind.Bag),
new OneHotHashEncodingEstimator.ColumnInfo("CatB", "A", OneHotEncodingTransformer.OutputKind.Bin),
new OneHotHashEncodingEstimator.ColumnInfo("CatC", "A", OneHotEncodingTransformer.OutputKind.Ind),
new OneHotHashEncodingEstimator.ColumnInfo("CatD", "A", OneHotEncodingTransformer.OutputKind.Key),
new OneHotHashEncodingEstimator.ColumnInfo("CatVA", "B", OneHotEncodingTransformer.OutputKind.Bag),
new OneHotHashEncodingEstimator.ColumnInfo("CatVB", "B", OneHotEncodingTransformer.OutputKind.Bin),
new OneHotHashEncodingEstimator.ColumnInfo("CatVC", "B", OneHotEncodingTransformer.OutputKind.Ind),
new OneHotHashEncodingEstimator.ColumnInfo("CatVD", "B", OneHotEncodingTransformer.OutputKind.Key),
});

TestEstimatorCore(pipe, dataView);
var outputPath = GetOutputPath("CategoricalHash", "oneHotHash.tsv");
var savedData = pipe.Fit(dataView).Transform(dataView);

using (var fs = File.Create(outputPath))
ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);
CheckEquality("CategoricalHash", "oneHotHash.tsv");
Done();
}

Expand All @@ -70,7 +80,7 @@ public void CategoricalHashStatic()
ScalarString: ctx.LoadText(1),
VectorString: ctx.LoadText(1, 4)));
var data = reader.Read(dataPath);
var wrongCollection = new[] { new TestClass() { A = "1", B = "2", C = "3", }, new TestClass() { A = "4", B = "5", C = "6" } };
var wrongCollection = new[] { new TestClass() { A = "1", B = new[] { "2", "3" } }, new TestClass() { A = "4", B = new[] { "4", "5" } } };

var invalidData = ML.Data.ReadFromEnumerable(wrongCollection);
var est = data.MakeNewEstimator().
Expand Down Expand Up @@ -210,12 +220,11 @@ public void TestCommandLine()
[Fact]
public void TestOldSavingAndLoading()
{
var data = new[] { new TestClass() { A = "1", B = "2", C = "3", }, new TestClass() { A = "4", B = "5", C = "6" } };
var data = new[] { new TestClass() { A = "1", B = new[] { "2", "3" } }, new TestClass() { A = "4", B = new[] { "4", "5" } } };
var dataView = ML.Data.ReadFromEnumerable(data);
var pipe = ML.Transforms.Categorical.OneHotHashEncoding(new[]{
new OneHotHashEncodingEstimator.ColumnInfo("CatHashA", "A"),
new OneHotHashEncodingEstimator.ColumnInfo("CatHashB", "B"),
new OneHotHashEncodingEstimator.ColumnInfo("CatHashC", "C")
new OneHotHashEncodingEstimator.ColumnInfo("CatHashB", "B")
});
var result = pipe.Fit(dataView).Transform(dataView);
var resultRoles = new RoleMappedData(result);
Expand Down
40 changes: 17 additions & 23 deletions test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ public CategoricalTests(ITestOutputHelper output) : base(output)
private sealed class TestClass
{
public int A;
public int B;
public int C;
[VectorType(2)]
public int[] B;
}

private sealed class TestMeta
Expand All @@ -56,41 +56,36 @@ private sealed class TestStringClass
[Fact]
public void CategoricalWorkout()
{
var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } };
var data = new[] { new TestClass() { A = 1, B = new int[2] { 2, 3 } }, new TestClass() { A = 4, B = new int[2] { 2, 4 } } };

var dataView = ML.Data.ReadFromEnumerable(data);
var pipe = ML.Transforms.Categorical.OneHotEncoding(new[]{
new OneHotEncodingEstimator.ColumnInfo("CatA", "A", OneHotEncodingTransformer.OutputKind.Bag),
new OneHotEncodingEstimator.ColumnInfo("CatB", "A", OneHotEncodingTransformer.OutputKind.Bin),
new OneHotEncodingEstimator.ColumnInfo("CatC", "A", OneHotEncodingTransformer.OutputKind.Ind),
new OneHotEncodingEstimator.ColumnInfo("CatD", "A", OneHotEncodingTransformer.OutputKind.Key),
new OneHotEncodingEstimator.ColumnInfo("CatVA", "B", OneHotEncodingTransformer.OutputKind.Bag),
new OneHotEncodingEstimator.ColumnInfo("CatVB", "B", OneHotEncodingTransformer.OutputKind.Bin),
new OneHotEncodingEstimator.ColumnInfo("CatVC", "B", OneHotEncodingTransformer.OutputKind.Ind),
new OneHotEncodingEstimator.ColumnInfo("CatVD", "B", OneHotEncodingTransformer.OutputKind.Key),
});

TestEstimatorCore(pipe, dataView);
Done();
}
var outputPath = GetOutputPath("Categorical", "oneHot.tsv");
var savedData = pipe.Fit(dataView).Transform(dataView);

[Fact]
public void CategoricalOneHotHashEncoding()
{
var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } };

var mlContext = new MLContext();
var dataView = mlContext.Data.ReadFromEnumerable(data);

var pipe = mlContext.Transforms.Categorical.OneHotHashEncoding("CatA", "A", 3, 0, OneHotEncodingTransformer.OutputKind.Bag)
.Append(mlContext.Transforms.Categorical.OneHotHashEncoding("CatB", "A", 2, 0, OneHotEncodingTransformer.OutputKind.Key))
.Append(mlContext.Transforms.Categorical.OneHotHashEncoding("CatC", "A", 3, 0, OneHotEncodingTransformer.OutputKind.Ind))
.Append(mlContext.Transforms.Categorical.OneHotHashEncoding("CatD", "A", 2, 0, OneHotEncodingTransformer.OutputKind.Bin));

TestEstimatorCore(pipe, dataView);
using (var fs = File.Create(outputPath))
ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);
CheckEquality("Categorical", "oneHot.tsv");
Done();
}



[Fact]
public void CategoricalOneHotEncoding()
{
var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } };
var data = new[] { new TestClass() { A = 1, B = new int[2] { 2, 3 } }, new TestClass() { A = 4, B = new int[2] { 2, 4 } } };

var mlContext = new MLContext();
var dataView = mlContext.Data.ReadFromEnumerable(data);
Expand Down Expand Up @@ -145,7 +140,7 @@ public void CategoricalStatic()
ScalarString: ctx.LoadText(1),
VectorString: ctx.LoadText(1, 4)));
var data = reader.Read(dataPath);
var wrongCollection = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } };
var wrongCollection = new[] { new TestClass() { A = 1, B = new int[2] { 2, 3 } }, new TestClass() { A = 4, B = new int[2] { 2, 4 } } };

var invalidData = ML.Data.ReadFromEnumerable(wrongCollection);
var est = data.MakeNewEstimator().
Expand Down Expand Up @@ -300,12 +295,11 @@ public void TestCommandLine()
[Fact]
public void TestOldSavingAndLoading()
{
var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } };
var data = new[] { new TestClass() { A = 1, B = new int[2] { 2, 3 } }, new TestClass() { A = 4, B = new int[2] { 2, 4 } } };
var dataView = ML.Data.ReadFromEnumerable(data);
var pipe = ML.Transforms.Categorical.OneHotEncoding(new[]{
new OneHotEncodingEstimator.ColumnInfo("TermA", "A"),
new OneHotEncodingEstimator.ColumnInfo("TermB", "B"),
new OneHotEncodingEstimator.ColumnInfo("TermC", "C")
});
var result = pipe.Fit(dataView).Transform(dataView);
var resultRoles = new RoleMappedData(result);
Expand Down