Skip to content

Commit 64523e8

Browse files
authored
Add text normalizer transformer to AutoML (#6998)
* Add text normalizer transformer to AutoML
* clean
* clean
* Add default
* Follow default pattern of ML.NET
1 parent e3a06e3 commit 64523e8

File tree

10 files changed

+126
-8
lines changed

10 files changed

+126
-8
lines changed

src/Microsoft.ML.AutoML/CodeGen/estimator-schema.json

+9-3
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@
7575
"SentenceSimilarity",
7676
"ObjectDetection",
7777
"QuestionAnswering",
78-
"NamedEntityRecognition"
78+
"NamedEntityRecognition",
79+
"NormalizeText"
7980
]
8081
},
8182
"nugetDependencies": {
@@ -114,7 +115,8 @@
114115
"Microsoft.ML.Transforms.Image",
115116
"Microsoft.ML.Trainers.FastTree",
116117
"Microsoft.ML.TorchSharp",
117-
"Microsoft.ML.Trainers.LightGbm"
118+
"Microsoft.ML.Trainers.LightGbm",
119+
"Microsoft.ML.Transforms.Text.TextNormalizingEstimator"
118120
]
119121
}
120122
},
@@ -198,7 +200,11 @@
198200
"scoreThreshold",
199201
"steps",
200202
"initLearningRate",
201-
"weightDecay"
203+
"weightDecay",
204+
"mode",
205+
"keepPunctuations",
206+
"keepDiacritics",
207+
"keepNumbers"
202208
]
203209
},
204210
"argumentType": {

src/Microsoft.ML.AutoML/CodeGen/normalize_text_search_space.json

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"$schema": "./search-space-schema.json#",
3+
"name": "normalize_text_option",
4+
"search_space": [
5+
{
6+
"name": "InputColumnName",
7+
"type": "string"
8+
},
9+
{
10+
"name": "OutputColumnName",
11+
"type": "string"
12+
},
13+
{
14+
"name": "Mode",
15+
"type": "caseMode",
16+
"default": "CaseMode.Lower"
17+
},
18+
{
19+
"name": "KeepDiacritics",
20+
"type": "boolean",
21+
"default": false
22+
},
23+
{
24+
"name": "KeepPunctuations",
25+
"type": "boolean",
26+
"default": true
27+
},
28+
{
29+
"name": "KeepNumbers",
30+
"type": "boolean",
31+
"default": true
32+
}
33+
]
34+
}

src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json

+18-4
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,14 @@
6666
"DataKind.DateTimeOffset"
6767
]
6868
},
69+
"caseMode": {
70+
"type": "string",
71+
"enum": [
72+
"CaseMode.Lower",
73+
"CaseMode.Upper",
74+
"CaseMode.None"
75+
]
76+
},
6977
"bertArchitectureArray": {
7078
"type": "array",
7179
"items": {
@@ -90,7 +98,7 @@
9098
"$ref": "#/definitions/dnnModelFactoryArray"
9199
},
92100
{
93-
"$ref": "#/definitions/imageClassificationArchArray"
101+
"$ref": "#/definitions/imageClassificationArchArray"
94102
},
95103
{
96104
"$ref": "#/definitions/boolArray"
@@ -168,7 +176,8 @@
168176
"sentence_similarity_option",
169177
"object_detection_option",
170178
"question_answering_option",
171-
"named_entity_recognition_option"
179+
"named_entity_recognition_option",
180+
"normalize_text_option"
172181
]
173182
},
174183
"option_name": {
@@ -241,7 +250,11 @@
241250
"TopKAnswers",
242251
"TargetType",
243252
"PredictionColumnName",
244-
"KeyData"
253+
"KeyData",
254+
"Mode",
255+
"KeepPunctuations",
256+
"KeepDiacritics",
257+
"KeepNumbers"
245258
]
246259
},
247260
"option_type": {
@@ -261,7 +274,8 @@
261274
"bertArchitecture",
262275
"imageClassificationArchType",
263276
"dataKind",
264-
"dataView"
277+
"dataView",
278+
"caseMode"
265279
]
266280
}
267281
},

src/Microsoft.ML.AutoML/CodeGen/transformer-estimators.json

+33
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,39 @@
180180
"usingStatements": [ "Microsoft.ML", "Microsoft.ML.Data" ],
181181
"searchOption": "featurize_text_option"
182182
},
183+
{
184+
"functionName": "NormalizeText",
185+
"estimatorTypes": [ "Text" ],
186+
"arguments": [
187+
{
188+
"argumentName": "outputColumnName",
189+
"argumentType": "string"
190+
},
191+
{
192+
"argumentName": "inputColumnName",
193+
"argumentType": "string"
194+
},
195+
{
196+
"argumentName": "mode",
197+
"argumentType": "caseMode"
198+
},
199+
{
200+
"argumentName": "keepDiacritics",
201+
"argumentType": "boolean"
202+
},
203+
{
204+
"argumentName": "keepPunctuations",
205+
"argumentType": "boolean"
206+
},
207+
{
208+
"argumentName": "keepNumbers",
209+
"argumentType": "boolean"
210+
}
211+
],
212+
"nugetDependencies": [ "Microsoft.ML" ],
213+
"usingStatements": [ "Microsoft.ML", "Microsoft.ML.Data", "Microsoft.ML.Transforms.Text.TextNormalizingEstimator"],
214+
"searchOption": "normalize_text_option"
215+
},
183216
{
184217
"functionName": "ConvertType",
185218
"estimatorTypes": [ "Conversion" ],

src/Microsoft.ML.AutoML/CodeGen/type_converter_search_space.json

+4
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818
{
1919
"name": "KeyData",
2020
"type": "dataView"
21+
},
22+
{
23+
"name": "Mode",
24+
"type": "caseMode"
2125
}
2226
]
2327
}

src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj

+4-1
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@
6969
<AdditionalFiles Include="CodeGen\code_gen_flag.json" />
7070
<AdditionalFiles Include="CodeGen\*-estimators.json" />
7171
</ItemGroup>
72+
<ItemGroup>
73+
<None Remove="CodeGen\normalize_text_search_space.json" />
74+
</ItemGroup>
7275

7376
<ItemGroup>
7477
<EmbeddedResource Include="Tuner\Portfolios.json">
@@ -86,7 +89,7 @@
8689
<!--The path needed to be hardcoded for this to work on our publishing CI-->
8790
<BuildOutputInPackage Condition="Exists('$(PackageAssetsPath)$(PackageIdFolderName)\runtimes\win-x86\native\LdaNative.pdb')" Include="$(PackageAssetsPath)$(PackageIdFolderName)\runtimes\win-x86\native\LdaNative.pdb" TargetPath="..\..\runtimes\win-x86\native" />
8891
<BuildOutputInPackage Condition="Exists('$(PackageAssetsPath)$(PackageIdFolderName)\runtimes\win-x64\native\LdaNative.pdb')" Include="$(PackageAssetsPath)$(PackageIdFolderName)\runtimes\win-x64\native\LdaNative.pdb" TargetPath="..\..\runtimes\win-x64\native" />
89-
<BuildOutputInPackage Condition="Exists('$(PackageAssetsPath)$(PackageIdFolderName)\runtimes\win-arm64\native\LdaNative.pdb')" Include="$(PackageAssetsPath)$(PackageIdFolderName)\runtimes\win-arm64\native\LdaNative.pdb" TargetPath="..\..\runtimes\win-arm64\native"/>
92+
<BuildOutputInPackage Condition="Exists('$(PackageAssetsPath)$(PackageIdFolderName)\runtimes\win-arm64\native\LdaNative.pdb')" Include="$(PackageAssetsPath)$(PackageIdFolderName)\runtimes\win-arm64\native\LdaNative.pdb" TargetPath="..\..\runtimes\win-arm64\native" />
9093
</ItemGroup>
9194
</Target>
9295

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System;
6+
using System.Collections.Generic;
7+
using System.Text;
8+
9+
namespace Microsoft.ML.AutoML.CodeGen
10+
{
11+
internal partial class NormalizeText
{
    /// <summary>
    /// Creates the text-normalization estimator configured by <paramref name="param"/>.
    /// </summary>
    /// <param name="context">ML context whose <c>Transforms.Text</c> catalog provides <c>NormalizeText</c>.</param>
    /// <param name="param">Option object supplying the column names, the case mode and the three keep-flags.</param>
    /// <returns>The estimator returned by the catalog's <c>NormalizeText</c> call.</returns>
    public override IEstimator<ITransformer> BuildFromOption(MLContext context, NormalizeTextOption param)
    {
        var textCatalog = context.Transforms.Text;

        // Arguments are positional: output column first, then input column,
        // followed by the case mode and the diacritics / punctuation / number flags.
        return textCatalog.NormalizeText(
            param.OutputColumnName,
            param.InputColumnName,
            param.Mode,
            param.KeepDiacritics,
            param.KeepPunctuations,
            param.KeepNumbers);
    }
}
18+
}

tools-local/Microsoft.ML.AutoML.SourceGenerator/SearchSpaceGenerator.cs

+2
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ public void Execute(GeneratorExecutionContext context)
5858
"imageClassificationArchType" => "Microsoft.ML.Vision.ImageClassificationTrainer.Architecture",
5959
"dataKind" => "Microsoft.ML.Data.DataKind",
6060
"dataView" => "Microsoft.ML.IDataView",
61+
"caseMode" => "Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode",
6162
_ => throw new ArgumentException("unknown type"),
6263
};
6364

@@ -78,6 +79,7 @@ public void Execute(GeneratorExecutionContext context)
7879
(_, "Microsoft.ML.Vision.ImageClassificationTrainer.Architecture") => defaultToken.GetValue<string>(),
7980
(_, "Microsoft.ML.Data.DataKind") => defaultToken.GetValue<string>(),
8081
(_, "Microsoft.ML.IDataView") => defaultToken.GetValue<string>(),
82+
(_, "Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode") => defaultToken.GetValue<string>(),
8183
(_, _) => throw new ArgumentException("unknown"),
8284
};
8385

tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.cs

+2
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ public virtual string TransformText()
3535
using BertArchitecture = Microsoft.ML.TorchSharp.NasBert.BertArchitecture;
3636
using static Microsoft.ML.Vision.ImageClassificationTrainer.Architecture;
3737
using DataKind = Microsoft.ML.Data.DataKind;
38+
using CaseMode = Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode;
39+
3840
#nullable enable
3941
4042
namespace ");

tools-local/Microsoft.ML.AutoML.SourceGenerator/Template/SearchSpace.tt

+2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ using Anchor = Microsoft.ML.Transforms.Image.ImageResizingEstimator.Anchor;
1313
using BertArchitecture = Microsoft.ML.TorchSharp.NasBert.BertArchitecture;
1414
using static Microsoft.ML.Vision.ImageClassificationTrainer.Architecture;
1515
using DataKind = Microsoft.ML.Data.DataKind;
16+
using CaseMode = Microsoft.ML.Transforms.Text.TextNormalizingEstimator.CaseMode;
17+
1618
#nullable enable
1719

1820
namespace <#=NameSpace#>

0 commit comments

Comments
 (0)