From 5d98e8e8e0f85cec518f1a16c9f25d24985193e3 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Wed, 22 May 2019 02:34:50 -0700 Subject: [PATCH 1/5] Create indices array for sparse vector in KeyToVector transformer only when resulting vector is sparse. --- .../Transforms/KeyToVector.cs | 18 +++++++++-- .../Transformers/CategoricalTests.cs | 31 +++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index 01fc39048c..1249649030 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -573,7 +573,14 @@ private ValueGetter> MakeGetterInd(DataViewRow input, int iinfo) if (key >= (uint)size) continue; editor.Values[count] = 1; - editor.Indices[count++] = slot * size + (int)key; + // Indices is only created when the vector is deemed to be dense. + // Alternatively we can force VBufferEditor to create an Indices array + // even in the case of dense vectors but this will result in increased + // memory footprint. + if (lenDst != cntSrc) + editor.Indices[count++] = slot * size + (int)key; + else + count++; } } else @@ -586,7 +593,14 @@ private ValueGetter> MakeGetterInd(DataViewRow input, int iinfo) if (key >= (uint)size) continue; editor.Values[count] = 1; - editor.Indices[count++] = indices[islot] * size + (int)key; + // Indices is only created when the vector is deemed to be dense. + // Alternatively we can force VBufferEditor to create an Indices array + // even in the case of dense vectors but this will result in increased + // memory footprint. 
+ if (lenDst != cntSrc) + editor.Indices[count++] = indices[islot] * size + (int)key; + else + count++; } } dst = editor.CommitTruncated(count); diff --git a/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs b/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs index 5d27514eb1..21c9daeccd 100644 --- a/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs @@ -29,6 +29,13 @@ private sealed class TestClass public int C; } + private sealed class TestClassWithLabel + { + public int A; + public int B; + public bool Label; + } + private sealed class TestMeta { [VectorType(2)] @@ -101,6 +108,30 @@ public void CategoricalOneHotEncoding() Done(); } + [Fact] + public void CategoricalOneHotEncodingVector() + { + var data = new[] { + new TestClassWithLabel() { A = 301, B = 2000, Label = true }, + new TestClassWithLabel() { A = 450, B = 3000, Label = true }, + new TestClassWithLabel() { A = -300, B = 4000, Label = true }, + new TestClassWithLabel() { A = 300, B = 2000, Label = false }, + new TestClassWithLabel() { A = 115, B = 2000, Label = false }, + new TestClassWithLabel() { A = 115, B = 2000, Label = false }}; + + var mlContext = new MLContext(); + var dataView = mlContext.Data.LoadFromEnumerable(data); + var pipe = mlContext.Transforms.Conversion.ConvertType("A", outputKind: DataKind.Single) + .Append(mlContext.Transforms.Conversion.ConvertType("B", outputKind: DataKind.Single)) + .Append(mlContext.Transforms.Concatenate("Features", new string[] { "A", "B" })) + .Append(mlContext.Transforms.Conversion.MapValueToKey("Label")) + .Append(mlContext.Transforms.NormalizeSupervisedBinning("Features", fixZero: false, maximumBinCount: 5, labelColumnName: "Label")) + .Append(mlContext.Transforms.Categorical.OneHotEncoding("Features", outputKind: OneHotEncodingEstimator.OutputKind.Indicator)); + + TestEstimatorCore(pipe, dataView); + Done(); + } + /// /// In which we take a categorical value and map it to a 
vector, but we get the mapping from a side data view /// rather than the data we are fitting. From ca23e7d2e7c68afad4a49cb85eb942472d3b1b01 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Wed, 22 May 2019 13:23:37 -0700 Subject: [PATCH 2/5] PR feedback. --- .../Transforms/KeyToVector.cs | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index 1249649030..467a9c65cf 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -560,7 +560,7 @@ private ValueGetter> MakeGetterInd(DataViewRow input, int iinfo) int lenDst = checked(size * lenSrc); var values = src.GetValues(); int cntSrc = values.Length; - var editor = VBufferEditor.Create(ref dst, lenDst, cntSrc); + var editor = VBufferEditor.Create(ref dst, lenDst, cntSrc, keepOldOnResize: false, requireIndicesOnDense: true); int count = 0; if (src.IsDense) @@ -573,14 +573,8 @@ private ValueGetter> MakeGetterInd(DataViewRow input, int iinfo) if (key >= (uint)size) continue; editor.Values[count] = 1; - // Indices is only created when the vector is deemed to be dense. - // Alternatively we can force VBufferEditor to create an Indices array - // even in the case of dense vectors but this will result in increased - // memory footprint. - if (lenDst != cntSrc) - editor.Indices[count++] = slot * size + (int)key; - else - count++; + editor.Indices[count++] = slot * size + (int)key; + } } else @@ -593,14 +587,7 @@ private ValueGetter> MakeGetterInd(DataViewRow input, int iinfo) if (key >= (uint)size) continue; editor.Values[count] = 1; - // Indices is only created when the vector is deemed to be dense. - // Alternatively we can force VBufferEditor to create an Indices array - // even in the case of dense vectors but this will result in increased - // memory footprint. 
- if (lenDst != cntSrc) - editor.Indices[count++] = indices[islot] * size + (int)key; - else - count++; + editor.Indices[count++] = indices[islot] * size + (int)key; } } dst = editor.CommitTruncated(count); From 2ae51013563adbc8e10800804d0b0420158ee4c5 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Wed, 22 May 2019 13:25:56 -0700 Subject: [PATCH 3/5] cleanup. --- src/Microsoft.ML.Data/Transforms/KeyToVector.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index 467a9c65cf..7cd70bcd70 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -574,7 +574,6 @@ private ValueGetter> MakeGetterInd(DataViewRow input, int iinfo) continue; editor.Values[count] = 1; editor.Indices[count++] = slot * size + (int)key; - } } else From cd8c3345141c9a8900dc2eb3bc086ea156d979ad Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Wed, 22 May 2019 13:37:16 -0700 Subject: [PATCH 4/5] cleanup. 
--- .../Transformers/CategoricalTests.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs b/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs index 21c9daeccd..93789f9a60 100644 --- a/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs @@ -122,11 +122,11 @@ public void CategoricalOneHotEncodingVector() var mlContext = new MLContext(); var dataView = mlContext.Data.LoadFromEnumerable(data); var pipe = mlContext.Transforms.Conversion.ConvertType("A", outputKind: DataKind.Single) - .Append(mlContext.Transforms.Conversion.ConvertType("B", outputKind: DataKind.Single)) - .Append(mlContext.Transforms.Concatenate("Features", new string[] { "A", "B" })) - .Append(mlContext.Transforms.Conversion.MapValueToKey("Label")) - .Append(mlContext.Transforms.NormalizeSupervisedBinning("Features", fixZero: false, maximumBinCount: 5, labelColumnName: "Label")) - .Append(mlContext.Transforms.Categorical.OneHotEncoding("Features", outputKind: OneHotEncodingEstimator.OutputKind.Indicator)); + .Append(mlContext.Transforms.Conversion.ConvertType("B", outputKind: DataKind.Single)) + .Append(mlContext.Transforms.Concatenate("Features", new string[] { "A", "B" })) + .Append(mlContext.Transforms.Conversion.MapValueToKey("Label")) + .Append(mlContext.Transforms.NormalizeSupervisedBinning("Features", fixZero: false, maximumBinCount: 5, labelColumnName: "Label")) + .Append(mlContext.Transforms.Categorical.OneHotEncoding("Features", outputKind: OneHotEncodingEstimator.OutputKind.Indicator)); TestEstimatorCore(pipe, dataView); Done(); From d4328344a11dbb1e01ff142f3064ec81345083e0 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Wed, 22 May 2019 14:03:21 -0700 Subject: [PATCH 5/5] port changes from PR #2678. 
--- .../Transforms/KeyToVector.cs | 6 +- .../KeyToVectorMapping.cs | 4 +- .../Categorical/featurized.tsv | 0 .../Common/Categorical/oneHot.tsv | 31 ++++++++ .../CategoricalHash/featurized.tsv | 0 .../Common/CategoricalHash/oneHotHash.tsv | 31 ++++++++ .../SingleRelease/Categorical/featurized.tsv | 14 ---- .../CategoricalHash/featurized.tsv | 13 ---- .../Transformers/CategoricalHashTests.cs | 29 ++++++-- .../Transformers/CategoricalTests.cs | 70 ++++++++----------- 10 files changed, 120 insertions(+), 78 deletions(-) rename test/BaselineOutput/{SingleDebug => Common}/Categorical/featurized.tsv (100%) create mode 100644 test/BaselineOutput/Common/Categorical/oneHot.tsv rename test/BaselineOutput/{SingleDebug => Common}/CategoricalHash/featurized.tsv (100%) create mode 100644 test/BaselineOutput/Common/CategoricalHash/oneHotHash.tsv delete mode 100644 test/BaselineOutput/SingleRelease/Categorical/featurized.tsv delete mode 100644 test/BaselineOutput/SingleRelease/CategoricalHash/featurized.tsv diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index 7cd70bcd70..12c0d7967c 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -814,14 +814,16 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) var metadata = new List(); if (col.Annotations.TryFindColumn(AnnotationUtils.Kinds.KeyValues, out var keyMeta)) - if (col.Kind != SchemaShape.Column.VectorKind.VariableVector && keyMeta.ItemType is TextDataViewType) + if (((colInfo.OutputCountVector && col.IsKey) || col.Kind != SchemaShape.Column.VectorKind.VariableVector) && keyMeta.ItemType is TextDataViewType) metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, keyMeta.ItemType, false)); if (!colInfo.OutputCountVector && (col.Kind == SchemaShape.Column.VectorKind.Scalar || col.Kind == SchemaShape.Column.VectorKind.Vector)) 
metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.CategoricalSlotRanges, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Int32, false)); if (!colInfo.OutputCountVector || (col.Kind == SchemaShape.Column.VectorKind.Scalar)) metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.IsNormalized, SchemaShape.Column.VectorKind.Scalar, BooleanDataViewType.Instance, false)); - result[colInfo.Name] = new SchemaShape.Column(colInfo.Name, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false, new SchemaShape(metadata)); + result[colInfo.Name] = new SchemaShape.Column(colInfo.Name, + col.Kind == SchemaShape.Column.VectorKind.VariableVector && !colInfo.OutputCountVector ? SchemaShape.Column.VectorKind.VariableVector : SchemaShape.Column.VectorKind.Vector, + NumberDataViewType.Single, false, new SchemaShape(metadata)); } return new SchemaShape(result.Values); diff --git a/src/Microsoft.ML.Transforms/KeyToVectorMapping.cs b/src/Microsoft.ML.Transforms/KeyToVectorMapping.cs index bc3bf5c81d..256d1aecb0 100644 --- a/src/Microsoft.ML.Transforms/KeyToVectorMapping.cs +++ b/src/Microsoft.ML.Transforms/KeyToVectorMapping.cs @@ -480,7 +480,9 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, keyMeta.ItemType, false)); if (col.Kind == SchemaShape.Column.VectorKind.Scalar) metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.IsNormalized, SchemaShape.Column.VectorKind.Scalar, BooleanDataViewType.Instance, false)); - result[colInfo.outputColumnName] = new SchemaShape.Column(colInfo.outputColumnName, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false, new SchemaShape(metadata)); + result[colInfo.outputColumnName] = new SchemaShape.Column(colInfo.outputColumnName, + col.Kind == SchemaShape.Column.VectorKind.VariableVector ? 
SchemaShape.Column.VectorKind.VariableVector : SchemaShape.Column.VectorKind.Vector, + NumberDataViewType.Single, false, new SchemaShape(metadata)); } return new SchemaShape(result.Values); diff --git a/test/BaselineOutput/SingleDebug/Categorical/featurized.tsv b/test/BaselineOutput/Common/Categorical/featurized.tsv similarity index 100% rename from test/BaselineOutput/SingleDebug/Categorical/featurized.tsv rename to test/BaselineOutput/Common/Categorical/featurized.tsv diff --git a/test/BaselineOutput/Common/Categorical/oneHot.tsv b/test/BaselineOutput/Common/Categorical/oneHot.tsv new file mode 100644 index 0000000000..08378713a6 --- /dev/null +++ b/test/BaselineOutput/Common/Categorical/oneHot.tsv @@ -0,0 +1,31 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=A:I4:0 +#@ col=B:I4:1-2 +#@ col=C:I4:3-** +#@ col={name=CatA type=U4 src={ min=-1} key=2} +#@ col={name=CatA src={ min=-1 max=0 vector=+}} +#@ col={name=CatB type=U4 src={ min=-1} key=2} +#@ col={name=CatB src={ min=-1 max=1 vector=+}} +#@ col={name=CatC type=U4 src={ min=-1} key=2} +#@ col={name=CatC src={ min=-1 max=0 vector=+}} +#@ col={name=CatD type=U4 src={ min=-1} key=2} +#@ col={name=CatVA type=U4 src={ min=-1 max=0 vector=+} key=3} +#@ col={name=CatVA src={ min=-1 max=1 vector=+}} +#@ col={name=CatVB type=U4 src={ min=-1 max=0 vector=+} key=3} +#@ col={name=CatVB src={ min=-1 max=4 vector=+}} +#@ col={name=CatVC type=U4 src={ min=-1 max=0 vector=+} key=3} +#@ col={name=CatVC src={ min=-1 max=4 vector=+}} +#@ col={name=CatVD type=U4 src={ min=-1 max=0 vector=+} key=3} +#@ col={name=CatVVA type=U4 src={ min=-1 var=+} key=3} +#@ col={name=CatVVA src={ min=-1 max=1 vector=+}} +#@ col={name=CatVVB type=U4 src={ min=-1 var=+} key=3} +#@ col={name=CatVVB src={ min=-1 var=+}} +#@ col={name=CatVVC type=U4 src={ min=-1 var=+} key=3} +#@ col={name=CatVVC src={ min=-1 var=+}} +#@ col={name=CatVVD type=U4 src={ min=-1 var=+} key=3} +#@ } +A "" "" CatA 1 4 CatB Bit2 Bit1 Bit0 CatC 1 4 CatD "" "" 2 3 4 "" "" 
[0].Bit2 [0].Bit1 [0].Bit0 [1].Bit2 [1].Bit1 [1].Bit0 "" "" [0].2 [0].3 [0].4 [1].2 [1].3 [1].4 "" "" 3 4 2 +1 2 3 3 4 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1 0 1 1 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1 +4 2 4 2 4 3 1 0 1 1 0 0 1 1 0 1 1 0 2 1 0 1 0 2 0 0 0 0 1 0 0 2 1 0 0 0 0 1 0 2 2 1 0 1 1 1 2 1 0 0 1 0 0 0 1 0 0 0 2 1 0 0 0 1 0 1 0 1 0 0 2 1 0 diff --git a/test/BaselineOutput/SingleDebug/CategoricalHash/featurized.tsv b/test/BaselineOutput/Common/CategoricalHash/featurized.tsv similarity index 100% rename from test/BaselineOutput/SingleDebug/CategoricalHash/featurized.tsv rename to test/BaselineOutput/Common/CategoricalHash/featurized.tsv diff --git a/test/BaselineOutput/Common/CategoricalHash/oneHotHash.tsv b/test/BaselineOutput/Common/CategoricalHash/oneHotHash.tsv new file mode 100644 index 0000000000..ae36d2ad2c --- /dev/null +++ b/test/BaselineOutput/Common/CategoricalHash/oneHotHash.tsv @@ -0,0 +1,31 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=A:TX:0 +#@ col=B:TX:1-2 +#@ col=C:TX:3-** +#@ col={name=CatA type=U4 src={ min=-1} key=65536} +#@ col={name=CatA src={ min=-1 max=65534 vector=+}} +#@ col={name=CatB type=U4 src={ min=-1} key=65536} +#@ col={name=CatB src={ min=-1 max=16 vector=+}} +#@ col={name=CatC type=U4 src={ min=-1} key=65536} +#@ col={name=CatC src={ min=-1 max=65534 vector=+}} +#@ col={name=CatD type=U4 src={ min=-1} key=65536} +#@ col={name=CatVA type=U4 src={ min=-1 max=0 vector=+} key=65536} +#@ col={name=CatVA src={ min=-1 max=65534 vector=+}} +#@ col={name=CatVB type=U4 src={ min=-1 max=0 vector=+} key=65536} +#@ col={name=CatVB src={ min=-1 max=34 vector=+}} +#@ col={name=CatVC type=U4 src={ min=-1 max=0 vector=+} key=65536} +#@ col={name=CatVC src={ min=-1 max=131070 vector=+}} +#@ col={name=CatVD type=U4 src={ min=-1 max=0 vector=+} key=65536} +#@ col={name=CatVVA type=U4 src={ min=-1 var=+} key=65536} +#@ col={name=CatVVA src={ min=-1 max=65534 vector=+}} +#@ col={name=CatVVB type=U4 src={ min=-1 
var=+} key=65536} +#@ col={name=CatVVB src={ min=-1 var=+}} +#@ col={name=CatVVC type=U4 src={ min=-1 var=+} key=65536} +#@ col={name=CatVVC src={ min=-1 var=+}} +#@ col={name=CatVVD type=U4 src={ min=-1 var=+} key=65536} +#@ } +A 393284 2:CatA 65539:CatB 65558:CatC 131095:CatD +1 2 3 2 3 4 17369 589955 17369:1 65536:17369 65540:1 65545:1 65546:1 65547:1 65548:1 65550:1 65551:1 65554:1 65555:17369 82925:1 131092:17369 131093:45477 131094:61578 176572:1 192673:1 196631:45477 196632:61578 196635:1 196637:1 196638:1 196642:1 196643:1 196645:1 196648:1 196650:1 196653:1 196654:1 196655:1 196656:1 196661:1 196665:1 196667:1 196669:45477 196670:61578 242148:1 323785:1 327743:45477 327744:61578 327745:45477 327746:61578 327747:39452 367200:1 373225:1 389326:1 393284:45477 393285:61578 393286:39452 393289:1 393291:1 393292:1 393296:1 393297:1 393299:1 393302:1 393304:1 393307:1 393308:1 393309:1 393310:1 393315:1 393319:1 393321:1 393325:1 393328:1 393329:1 393331:1 393336:1 393337:1 393338:1 393341:45477 393342:61578 393343:39452 438821:1 520458:1 563868:1 589952:45477 589953:61578 589954:39452 +4 4 5 3 4 5 20750 589955 20750:1 65536:20750 65540:1 65542:1 65546:1 65551:1 65552:1 65553:1 65555:20750 86306:1 131092:20750 131093:20750 131094:23709 151845:1 154804:1 196631:20750 196632:23709 196636:1 196638:1 196642:1 196647:1 196648:1 196649:1 196654:1 196656:1 196657:1 196658:1 196661:1 196664:1 196665:1 196666:1 196668:1 196669:20750 196670:23709 217421:1 285916:1 327743:20750 327744:23709 327745:47483 327746:61549 327747:22463 350211:1 375231:1 389297:1 393284:47483 393285:61549 393286:22463 393289:1 393291:1 393292:1 393293:1 393296:1 393298:1 393299:1 393300:1 393301:1 393303:1 393304:1 393307:1 393308:1 393309:1 393310:1 393316:1 393317:1 393319:1 393320:1 393322:1 393326:1 393328:1 393330:1 393331:1 393332:1 393333:1 393335:1 393336:1 393337:1 393338:1 393339:1 393340:1 393341:47483 393342:61549 393343:22463 440827:1 520429:1 546879:1 589952:47483 589953:61549 
589954:22463 diff --git a/test/BaselineOutput/SingleRelease/Categorical/featurized.tsv b/test/BaselineOutput/SingleRelease/Categorical/featurized.tsv deleted file mode 100644 index 99a6cef1c5..0000000000 --- a/test/BaselineOutput/SingleRelease/Categorical/featurized.tsv +++ /dev/null @@ -1,14 +0,0 @@ -#@ TextLoader{ -#@ header+ -#@ sep=tab -#@ col=A:R4:0-9 -#@ col=B:R4:10-49 -#@ col=C:R4:50-59 -#@ col=D:R4:60-64 -#@ col=E:R4:65-84 -#@ } -5 3 6 4 8 1 2 7 10 9 [0].5 [0].1 [0].4 [0].3 [0].6 [0].8 [0].10 [0].2 [0].7 [0].9 [1].5 [1].1 [1].4 [1].3 [1].6 [1].8 [1].10 [1].2 [1].7 [1].9 [2].5 [2].1 [2].4 [2].3 [2].6 [2].8 [2].10 [2].2 [2].7 [2].9 [3].5 [3].1 [3].4 [3].3 [3].6 [3].8 [3].10 [3].2 [3].7 [3].9 5 1 4 3 6 8 10 2 7 9 Bit4 Bit3 Bit2 Bit1 Bit0 [0].Bit4 [0].Bit3 [0].Bit2 [0].Bit1 [0].Bit0 [1].Bit4 [1].Bit3 [1].Bit2 [1].Bit1 [1].Bit0 [2].Bit4 [2].Bit3 [2].Bit2 [2].Bit1 [2].Bit0 [3].Bit4 [3].Bit3 [3].Bit2 [3].Bit1 [3].Bit0 -85 0:1 10:1 21:1 31:1 41:1 50:1 51:3 74:1 79:1 84:1 -85 0:1 10:1 22:1 32:1 40:1 50:2 52:2 73:1 78:1 -85 1:1 13:1 21:1 31:1 41:1 51:3 53:1 64:1 68:1 69:1 74:1 79:1 84:1 -85 2:1 14:1 25:1 35:1 41:1 51:1 54:1 55:2 63:1 67:1 72:1 74:1 77:1 79:1 84:1 diff --git a/test/BaselineOutput/SingleRelease/CategoricalHash/featurized.tsv b/test/BaselineOutput/SingleRelease/CategoricalHash/featurized.tsv deleted file mode 100644 index 4bd60fd39a..0000000000 --- a/test/BaselineOutput/SingleRelease/CategoricalHash/featurized.tsv +++ /dev/null @@ -1,13 +0,0 @@ -#@ TextLoader{ -#@ sep=tab -#@ col=A:R4:0-65535 -#@ col=B:R4:65536-327679 -#@ col=C:R4:327680-393215 -#@ col=D:R4:393216-393233 -#@ col=E:R4:393234-393305 -#@ col=F:R4:393306-458841 -#@ } -458842 11529:1 77065:1 165873:1 196777:1 326564:1 327849:1 339209:1 362481:1 392100:1 393220:1 393222:1 393223:1 393225:1 393230:1 393233:1 393238:1 393240:1 393241:1 393243:1 393248:1 393251:1 393254:1 393259:1 393260:1 393261:1 393262:1 393263:1 393264:1 393265:1 393269:1 393280:1 393282:1 393284:1 393287:1 393290:1 393291:1 
393292:1 393293:1 393294:1 393296:1 393297:1 393298:1 393300:1 393303:1 404835:1 -458842 11529:1 77065:1 192621:1 236060:1 323071:1 339209:1 367132:1 388607:1 389229:1 393220:1 393222:1 393223:1 393225:1 393230:1 393233:1 393238:1 393240:1 393241:1 393243:1 393248:1 393251:1 393254:1 393255:1 393256:1 393257:1 393263:1 393264:1 393266:1 393267:1 393269:1 393272:1 393275:1 393276:1 393278:1 393283:1 393284:1 393285:1 393290:1 393291:1 393292:1 393294:1 393295:1 393297:1 393298:1 393299:1 393300:1 393301:1 393302:1 393303:1 393304:1 393305:1 404835:1 -458842 47483:1 113019:1 165873:1 196777:1 326564:1 327849:1 362481:1 375163:1 392100:1 393218:1 393220:1 393221:1 393222:1 393225:1 393227:1 393228:1 393229:1 393230:1 393232:1 393233:1 393236:1 393238:1 393239:1 393240:1 393243:1 393245:1 393246:1 393247:1 393248:1 393250:1 393251:1 393254:1 393259:1 393260:1 393261:1 393262:1 393263:1 393264:1 393265:1 393269:1 393280:1 393282:1 393284:1 393287:1 393290:1 393291:1 393292:1 393293:1 393294:1 393296:1 393297:1 393298:1 393300:1 393303:1 440789:1 -458842 42588:1 108124:1 173921:1 212446:1 326564:1 343518:1 370268:1 370529:1 392100:1 393218:1 393220:1 393223:1 393224:1 393227:1 393229:1 393230:1 393231:1 393236:1 393238:1 393241:1 393242:1 393245:1 393247:1 393248:1 393249:1 393254:1 393256:1 393259:1 393260:1 393261:1 393263:1 393264:1 393269:1 393274:1 393275:1 393276:1 393277:1 393279:1 393280:1 393281:1 393283:1 393284:1 393285:1 393286:1 393290:1 393291:1 393292:1 393293:1 393294:1 393296:1 393297:1 393298:1 393300:1 393303:1 435894:1 diff --git a/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs b/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs index 4d3620d5ff..48c344dfb4 100644 --- a/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs @@ -25,8 +25,9 @@ public CategoricalHashTests(ITestOutputHelper output) : base(output) private class TestClass { public string A; - 
public string B; - public string C; + [VectorType(2)] + public string[] B; + public string[] C; } private class TestMeta @@ -45,17 +46,31 @@ private class TestMeta [Fact] public void CategoricalHashWorkout() { - var data = new[] { new TestClass() { A = "1", B = "2", C = "3", }, new TestClass() { A = "4", B = "5", C = "6" } }; + var data = new[] { new TestClass() { A = "1", B = new[] { "2", "3" }, C = new[] { "2", "3", "4" } }, new TestClass() { A = "4", B = new[] { "4", "5" }, C = new[] { "3", "4", "5" } } }; var dataView = ML.Data.LoadFromEnumerable(data); var pipe = ML.Transforms.Categorical.OneHotHashEncoding(new[]{ - new OneHotHashEncodingEstimator.ColumnOptions("CatA", "A", OneHotEncodingEstimator.OutputKind.Bag), + new OneHotHashEncodingEstimator.ColumnOptions("CatA", "A", OneHotEncodingEstimator.OutputKind.Bag), new OneHotHashEncodingEstimator.ColumnOptions("CatB", "A", OneHotEncodingEstimator.OutputKind.Binary), new OneHotHashEncodingEstimator.ColumnOptions("CatC", "A", OneHotEncodingEstimator.OutputKind.Indicator), new OneHotHashEncodingEstimator.ColumnOptions("CatD", "A", OneHotEncodingEstimator.OutputKind.Key), + new OneHotHashEncodingEstimator.ColumnOptions("CatVA", "B", OneHotEncodingEstimator.OutputKind.Bag), + new OneHotHashEncodingEstimator.ColumnOptions("CatVB", "B", OneHotEncodingEstimator.OutputKind.Binary), + new OneHotHashEncodingEstimator.ColumnOptions("CatVC", "B", OneHotEncodingEstimator.OutputKind.Indicator), + new OneHotHashEncodingEstimator.ColumnOptions("CatVD", "B", OneHotEncodingEstimator.OutputKind.Key), + new OneHotHashEncodingEstimator.ColumnOptions("CatVVA", "C", OneHotEncodingEstimator.OutputKind.Bag), + new OneHotHashEncodingEstimator.ColumnOptions("CatVVB", "C", OneHotEncodingEstimator.OutputKind.Binary), + new OneHotHashEncodingEstimator.ColumnOptions("CatVVC", "C", OneHotEncodingEstimator.OutputKind.Indicator), + new OneHotHashEncodingEstimator.ColumnOptions("CatVVD", "C", OneHotEncodingEstimator.OutputKind.Key), }); 
TestEstimatorCore(pipe, dataView); + var outputPath = GetOutputPath("CategoricalHash", "oneHotHash.tsv"); + var savedData = pipe.Fit(dataView).Transform(dataView); + + using (var fs = File.Create(outputPath)) + ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true); + CheckEquality("CategoricalHash", "oneHotHash.tsv"); Done(); } @@ -68,7 +83,7 @@ public void CategoricalHashStatic() VectorString: ctx.LoadText(1, 4), SingleVectorString: ctx.LoadText(1, 1))); var data = reader.Load(dataPath); - var wrongCollection = new[] { new TestClass() { A = "1", B = "2", C = "3", }, new TestClass() { A = "4", B = "5", C = "6" } }; + var wrongCollection = new[] { new TestClass() { A = "1", B = new[] { "2", "3" }, C = new[] { "2", "3", "4" } }, new TestClass() { A = "4", B = new[] { "4", "5" }, C = new[] { "3", "4", "5" } } }; var invalidData = ML.Data.LoadFromEnumerable(wrongCollection); var est = data.MakeNewEstimator(). @@ -211,12 +226,12 @@ public void TestCommandLine() [Fact] public void TestOldSavingAndLoading() { - var data = new[] { new TestClass() { A = "1", B = "2", C = "3", }, new TestClass() { A = "4", B = "5", C = "6" } }; + var data = new[] { new TestClass() { A = "1", B = new[] { "2", "3" }, C = new[] { "2", "3", "4" } }, new TestClass() { A = "4", B = new[] { "4", "5" }, C = new[] { "3", "4", "5" } } }; var dataView = ML.Data.LoadFromEnumerable(data); var pipe = ML.Transforms.Categorical.OneHotHashEncoding(new[]{ new OneHotHashEncodingEstimator.ColumnOptions("CatHashA", "A"), new OneHotHashEncodingEstimator.ColumnOptions("CatHashB", "B"), - new OneHotHashEncodingEstimator.ColumnOptions("CatHashC", "C") + new OneHotHashEncodingEstimator.ColumnOptions("CatHashC", "C"), }); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result); diff --git a/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs b/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs index 93789f9a60..e5efd9e646 100644 --- 
a/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs @@ -25,8 +25,10 @@ public CategoricalTests(ITestOutputHelper output) : base(output) private sealed class TestClass { public int A; - public int B; - public int C; + [VectorType(2)] + public int[] B; + public int[] C; + } private sealed class TestClassWithLabel @@ -60,7 +62,10 @@ private sealed class TestStringClass [Fact] public void CategoricalWorkout() { - var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; + var data = new[] { + new TestClass() { A = 1, B = new int[2] { 2, 3 }, C = new int[2] { 3, 4 } }, + new TestClass() { A = 4, B = new int[2] { 2, 4 }, C = new int[3] { 2, 4, 3 } } + }; var dataView = ML.Data.LoadFromEnumerable(data); var pipe = ML.Transforms.Categorical.OneHotEncoding(new[]{ @@ -68,43 +73,23 @@ public void CategoricalWorkout() new OneHotEncodingEstimator.ColumnOptions("CatB", "A", OneHotEncodingEstimator.OutputKind.Binary), new OneHotEncodingEstimator.ColumnOptions("CatC", "A", OneHotEncodingEstimator.OutputKind.Indicator), new OneHotEncodingEstimator.ColumnOptions("CatD", "A", OneHotEncodingEstimator.OutputKind.Key), + new OneHotEncodingEstimator.ColumnOptions("CatVA", "B", OneHotEncodingEstimator.OutputKind.Bag), + new OneHotEncodingEstimator.ColumnOptions("CatVB", "B", OneHotEncodingEstimator.OutputKind.Binary), + new OneHotEncodingEstimator.ColumnOptions("CatVC", "B", OneHotEncodingEstimator.OutputKind.Indicator), + new OneHotEncodingEstimator.ColumnOptions("CatVD", "B", OneHotEncodingEstimator.OutputKind.Key), + new OneHotEncodingEstimator.ColumnOptions("CatVVA", "C", OneHotEncodingEstimator.OutputKind.Bag), + new OneHotEncodingEstimator.ColumnOptions("CatVVB", "C", OneHotEncodingEstimator.OutputKind.Binary), + new OneHotEncodingEstimator.ColumnOptions("CatVVC", "C", OneHotEncodingEstimator.OutputKind.Indicator), + new OneHotEncodingEstimator.ColumnOptions("CatVVD", 
"C", OneHotEncodingEstimator.OutputKind.Key), }); TestEstimatorCore(pipe, dataView); - Done(); - } - - [Fact] - public void CategoricalOneHotHashEncoding() - { - var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; - - var mlContext = new MLContext(); - var dataView = mlContext.Data.LoadFromEnumerable(data); - - var pipe = mlContext.Transforms.Categorical.OneHotHashEncoding("CatA", "A", OneHotEncodingEstimator.OutputKind.Bag, 3, 0) - .Append(mlContext.Transforms.Categorical.OneHotHashEncoding("CatB", "A", OneHotEncodingEstimator.OutputKind.Key, 2, 0)) - .Append(mlContext.Transforms.Categorical.OneHotHashEncoding("CatC", "A", OneHotEncodingEstimator.OutputKind.Indicator, 3, 0)) - .Append(mlContext.Transforms.Categorical.OneHotHashEncoding("CatD", "A", OneHotEncodingEstimator.OutputKind.Binary, 2, 0)); - - TestEstimatorCore(pipe, dataView); - Done(); - } - - [Fact] - public void CategoricalOneHotEncoding() - { - var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; + var outputPath = GetOutputPath("Categorical", "oneHot.tsv"); + var savedData = pipe.Fit(dataView).Transform(dataView); - var mlContext = new MLContext(); - var dataView = mlContext.Data.LoadFromEnumerable(data); - - var pipe = mlContext.Transforms.Categorical.OneHotEncoding("CatA", "A", OneHotEncodingEstimator.OutputKind.Bag) - .Append(mlContext.Transforms.Categorical.OneHotEncoding("CatB", "A", OneHotEncodingEstimator.OutputKind.Key)) - .Append(mlContext.Transforms.Categorical.OneHotEncoding("CatC", "A", OneHotEncodingEstimator.OutputKind.Indicator)) - .Append(mlContext.Transforms.Categorical.OneHotEncoding("CatD", "A", OneHotEncodingEstimator.OutputKind.Binary)); - - TestEstimatorCore(pipe, dataView); + using (var fs = File.Create(outputPath)) + ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true); + CheckEquality("Categorical", "oneHot.tsv"); Done(); } @@ -173,7 +158,7 @@ public void 
CategoricalStatic() ScalarString: ctx.LoadText(1), VectorString: ctx.LoadText(1, 4))); var data = reader.Load(dataPath); - var wrongCollection = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; + var wrongCollection = new[] { new TestClass() { A = 1, B = new int[2] { 2, 3 } }, new TestClass() { A = 4, B = new int[2] { 2, 4 } } }; var invalidData = ML.Data.LoadFromEnumerable(wrongCollection); var est = data.MakeNewEstimator(). @@ -328,12 +313,15 @@ public void TestCommandLine() [Fact] public void TestOldSavingAndLoading() { - var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; + var data = new[] { + new TestClass() { A = 1, B = new int[2] { 2, 3 }, C = new int[2] { 3, 4 } }, + new TestClass() { A = 4, B = new int[2] { 2, 4 }, C = new int[3] { 2, 4, 3 } } + }; var dataView = ML.Data.LoadFromEnumerable(data); var pipe = ML.Transforms.Categorical.OneHotEncoding(new[]{ - new InputOutputColumnPair("TermA", "A"), - new InputOutputColumnPair("TermB", "B"), - new InputOutputColumnPair("TermC", "C") + new OneHotEncodingEstimator.ColumnOptions("CatA", "A"), + new OneHotEncodingEstimator.ColumnOptions("CatB", "B"), + new OneHotEncodingEstimator.ColumnOptions("CatC", "C") }); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result);