Skip to content

StopWordsRemovingEstimator export to Onnx #5279

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 85 additions & 4 deletions src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
using Microsoft.ML.Data.IO;
using Microsoft.ML.EntryPoints;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Model.OnnxConverter;
using Microsoft.ML.Runtime;
using Microsoft.ML.Transforms.Text;

Expand Down Expand Up @@ -343,14 +344,16 @@ private static Stream GetResourceFileStreamOrNull(StopWordsRemovingEstimator.Lan
return assembly.GetManifestResourceStream($"{assembly.GetName().Name}.Text.StopWords.{lang.ToString()}.txt");
}

private sealed class Mapper : MapperBase
private sealed class Mapper : MapperBase, ISaveAsOnnx
{
private readonly DataViewType[] _types;
private readonly StopWordsRemovingTransformer _parent;
private readonly int[] _languageColumns;
private readonly bool?[] _resourcesExist;
private readonly Dictionary<int, int> _colMapNewToOld;

public bool CanSaveOnnx(OnnxContext ctx) => true;

public Mapper(StopWordsRemovingTransformer parent, DataViewSchema inputSchema)
: base(Contracts.CheckRef(parent, nameof(parent)).Host.Register(nameof(Mapper)), inputSchema, parent)
{
Expand Down Expand Up @@ -438,6 +441,45 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func<int, b
return del;
}

/// <summary>
/// Exports each configured column pair to the ONNX graph. Pairs whose source
/// variable is absent from the ONNX context are skipped.
/// </summary>
/// <param name="ctx">The ONNX export context that receives the generated nodes.</param>
public void SaveAsOnnx(OnnxContext ctx)
{
    // The Squeeze/Unsqueeze "axes" attribute emitted below requires at least opset 9.
    const int minimumOpSetVersion = 9;
    ctx.CheckOpSetVersion(minimumOpSetVersion, LoaderSignature);

    for (int iinfo = 0; iinfo < _parent.ColumnPairs.Length; iinfo++)
    {
        var pair = _parent.ColumnPairs[iinfo];
        var srcVariableName = ctx.GetVariableName(pair.inputColumnName);
        if (!ctx.ContainsColumn(srcVariableName))
            continue;

        var dstVariableName = ctx.AddIntermediateVariable(_types[iinfo], pair.outputColumnName);
        SaveAsOnnxCore(ctx, iinfo, srcVariableName, dstVariableName);
    }
}

// Note: Since StringNormalizer only accepts inputs of shape [C] or [1,C], we temporarily squeeze the
// batch dimension, which may exceed 1, and restore it after the stopwords are removed.
private void SaveAsOnnxCore(OnnxContext ctx, int iinfo, string srcVariableName, string dstVariableName)
{
    // Squeeze the leading batch dimension so the input matches StringNormalizer's expected shape.
    var opType = "Squeeze";
    var squeezeOutput = ctx.AddIntermediateVariable(_types[iinfo], "SqueezeOutput", true);
    var node = ctx.CreateNode(opType, srcVariableName, squeezeOutput, ctx.GetNodeName(opType), "");
    node.AddAttribute("axes", new long[] { 0 });

    // StringNormalizer drops any token that appears in the "stopwords" attribute.
    opType = "StringNormalizer";
    var stringNormalizerOutput = ctx.AddIntermediateVariable(_types[iinfo], "StringNormalizerOutput", true);
    node = ctx.CreateNode(opType, squeezeOutput, stringNormalizerOutput, ctx.GetNodeName(opType), "");

    // Resolve the language for this column.
    // NOTE(review): langToUse/lang are not consumed below because StopWords[iinfo] is already
    // built for the resolved language — confirm whether this call is still needed for its
    // validation side effects, or can be removed.
    var langToUse = _parent._columns[iinfo].Language;
    var lang = default(ReadOnlyMemory<char>);
    UpdateLanguage(ref langToUse, null, ref lang);

    // Reuse the projected stopword list instead of evaluating the Select expression twice.
    var words = StopWords[iinfo].Select(item => Convert.ToString(item.Value));
    node.AddAttribute("stopwords", words);

    // Restore the batch dimension squeezed away above. The Unsqueeze node writes directly to
    // the destination variable, so no extra intermediate variable is needed here.
    opType = "Unsqueeze";
    node = ctx.CreateNode(opType, stringNormalizerOutput, dstVariableName, ctx.GetNodeName(opType), "");
    node.AddAttribute("axes", new long[] { 0 });
}

private void UpdateLanguage(ref StopWordsRemovingEstimator.Language langToUse, ValueGetter<ReadOnlyMemory<char>> getLang, ref ReadOnlyMemory<char> langTxt)
{
if (getLang != null)
Expand Down Expand Up @@ -490,7 +532,7 @@ private protected override Func<int, bool> GetDependenciesCore(Func<int, bool> a
/// | Does this estimator need to look at the data to train its parameters? | No |
/// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
/// | Output column data type | Variable-sized vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
/// | Exportable to ONNX | No |
/// | Exportable to ONNX | Yes |
///
/// The resulting <xref:Microsoft.ML.Transforms.Text.StopWordsRemovingTransformer> creates a new column, named as specified in the output column name parameter,
/// and fills it with a vector of words containing all of the words in the input column **except** the predefined list of stopwords for the specified language.
Expand Down Expand Up @@ -1016,11 +1058,13 @@ private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, Dat

private protected override IRowMapper MakeRowMapper(DataViewSchema schema) => new Mapper(this, schema);

private sealed class Mapper : OneToOneMapperBase
private sealed class Mapper : OneToOneMapperBase, ISaveAsOnnx
{
private readonly DataViewType[] _types;
private readonly CustomStopWordsRemovingTransformer _parent;

public bool CanSaveOnnx(OnnxContext ctx) => true;

public Mapper(CustomStopWordsRemovingTransformer parent, DataViewSchema inputSchema)
: base(Contracts.CheckRef(parent, nameof(parent)).Host.Register(nameof(Mapper)), parent, inputSchema)
{
Expand Down Expand Up @@ -1084,6 +1128,43 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func<int, b

return del;
}

/// <summary>
/// Exports each configured column pair to the ONNX graph. Pairs whose source
/// variable is absent from the ONNX context are skipped.
/// </summary>
/// <param name="ctx">The ONNX export context that receives the generated nodes.</param>
public void SaveAsOnnx(OnnxContext ctx)
{
    // The Squeeze/Unsqueeze "axes" attribute emitted below requires at least opset 9.
    const int minimumOpSetVersion = 9;
    ctx.CheckOpSetVersion(minimumOpSetVersion, LoaderSignature);

    for (int iinfo = 0; iinfo < _parent.ColumnPairs.Length; iinfo++)
    {
        var pair = _parent.ColumnPairs[iinfo];
        var srcVariableName = ctx.GetVariableName(pair.inputColumnName);
        if (!ctx.ContainsColumn(srcVariableName))
            continue;

        var dstVariableName = ctx.AddIntermediateVariable(_types[iinfo], pair.outputColumnName);
        SaveAsOnnxCore(ctx, iinfo, srcVariableName, dstVariableName);
    }
}

// Note: Since StringNormalizer only accepts inputs of shape [C] or [1,C], we temporarily squeeze the
// batch dimension, which may exceed 1, and restore it after the stopwords are removed.
private void SaveAsOnnxCore(OnnxContext ctx, int iinfo, string srcVariableName, string dstVariableName)
{
    // Squeeze the leading batch dimension so the input matches StringNormalizer's expected shape.
    var opType = "Squeeze";
    var squeezeOutput = ctx.AddIntermediateVariable(_types[iinfo], "SqueezeOutput", true);
    var node = ctx.CreateNode(opType, srcVariableName, squeezeOutput, ctx.GetNodeName(opType), "");
    node.AddAttribute("axes", new long[] { 0 });

    // StringNormalizer drops any token that appears in the "stopwords" attribute.
    opType = "StringNormalizer";
    var stringNormalizerOutput = ctx.AddIntermediateVariable(_types[iinfo], "StringNormalizerOutput", true);
    node = ctx.CreateNode(opType, squeezeOutput, stringNormalizerOutput, ctx.GetNodeName(opType), "");
    // Enumerate the custom stopwords map lazily; materializing with ToList() first is unnecessary.
    var words = _parent._stopWordsMap.Select(item => Convert.ToString(item.Value));
    node.AddAttribute("stopwords", words);

    // Restore the batch dimension squeezed away above. The Unsqueeze node writes directly to
    // the destination variable, so no extra intermediate variable is needed here.
    opType = "Unsqueeze";
    node = ctx.CreateNode(opType, stringNormalizerOutput, dstVariableName, ctx.GetNodeName(opType), "");
    node.AddAttribute("axes", new long[] { 0 });
}
}
}

Expand All @@ -1098,7 +1179,7 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func<int, b
/// | Does this estimator need to look at the data to train its parameters? | No |
/// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
/// | Output column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
/// | Exportable to ONNX | No |
/// | Exportable to ONNX | Yes |
///
/// The resulting <xref:Microsoft.ML.Transforms.Text.CustomStopWordsRemovingTransformer> creates a new column, named as specified by the output column name parameter, and
/// fills it with a vector of words containing all of the words in the input column except those given by the stopwords parameter.
Expand Down
52 changes: 50 additions & 2 deletions test/Microsoft.ML.Tests/OnnxConversionTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -974,8 +974,8 @@ public void OneHotHashEncodingOnnxConversionTest()
var mlContext = new MLContext();
string dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename);

var dataView = ML.Data.LoadFromTextFile<BreastCancerCatFeatureExample>(dataPath);
var pipeline = ML.Transforms.Categorical.OneHotHashEncoding(new[]{
var dataView = mlContext.Data.LoadFromTextFile<BreastCancerCatFeatureExample>(dataPath);
var pipeline = mlContext.Transforms.Categorical.OneHotHashEncoding(new[]{
new OneHotHashEncodingEstimator.ColumnOptions("Output", "F3", useOrderedHashing:false),
});
var onnxFileName = "OneHotHashEncoding.onnx";
Expand Down Expand Up @@ -1343,6 +1343,54 @@ public void NgramOnnxConversionTest(
Done();
}

[Fact]
public void CustomStopWordsRemovingEstimatorOnnxTest()
{
    var mlContext = new MLContext();

    // Tokenize, then strip an explicit custom stopword list from the tokens.
    var pipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text")
        .Append(mlContext.Transforms.Text.RemoveStopWords(
            "WordsWithoutStopWords", "Words", stopwords:
            new[] { "cat", "sat", "on" }));

    var samples = new List<TextData>()
    {
        new TextData(){ Text = "cat sat on mat" },
        new TextData(){ Text = "mat not fit cat" },
        new TextData(){ Text = "a cat think mat bad" },
    };
    var dataView = mlContext.Data.LoadFromEnumerable(samples);
    // No interpolation holes, so a plain string literal suffices (no '$' prefix needed).
    var onnxFileName = "CustomStopWordsRemovingEstimator.onnx";

    // Verify the ONNX export produces the same filtered tokens as the ML.NET pipeline.
    TestPipeline(pipeline, dataView, onnxFileName, new ColumnComparison[] { new ColumnComparison("WordsWithoutStopWords") });

    Done();
}

[Fact]
public void StopWordsRemovingEstimatorOnnxTest()
{
    var mlContext = new MLContext();

    // Tokenize, then strip the built-in English stopword list from the tokens.
    var pipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text")
        .Append(mlContext.Transforms.Text.RemoveDefaultStopWords(
            "WordsWithoutStopWords", "Words", language:
            StopWordsRemovingEstimator.Language.English));

    var samples = new List<TextData>()
    {
        new TextData(){ Text = "a go cat sat on mat" },
        new TextData(){ Text = "a mat not fit go cat" },
        new TextData(){ Text = "cat think mat bad a" },
    };
    var dataView = mlContext.Data.LoadFromEnumerable(samples);
    // No interpolation holes, so a plain string literal suffices (no '$' prefix needed).
    var onnxFileName = "StopWordsRemovingEstimator.onnx";

    // Verify the ONNX export produces the same filtered tokens as the ML.NET pipeline.
    TestPipeline(pipeline, dataView, onnxFileName, new ColumnComparison[] { new ColumnComparison("WordsWithoutStopWords") });

    Done();
}

[Theory]
[InlineData(DataKind.Boolean)]
[InlineData(DataKind.SByte)]
Expand Down