
Polish char- and word-level tokenizers & stopword removers #2916

Merged
merged 7 commits on Mar 13, 2019
Changes from 1 commit
Rename ProduceCharacterTokens to ProduceCharactersAsKeys
wschin committed Mar 13, 2019
commit 883784a1935d50f39418f733208ea0279f942e4a
docs/code/MlNetCookBook.md (2 changes: 1 addition & 1 deletion)
@@ -775,7 +775,7 @@ var pipeline =
ngramLength: 2, useAllLengths: false))

// NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting.
- .Append(mlContext.Transforms.Text.ProduceCharacterTokens("MessageChars", "Message"))
+ .Append(mlContext.Transforms.Text.ProduceCharactersAsKeys("MessageChars", "Message"))
.Append(new NgramExtractingEstimator(mlContext, "BagOfTrichar", "MessageChars",
ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf))
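Outside the diff itself, a minimal end-to-end sketch of the cookbook's "NLP pipeline 3" under the renamed API could look like the following. The `mlContext` instance, the `data` IDataView, and its "Message" text column are assumptions for illustration and are not part of this commit.

```csharp
// Hedged sketch: tri-character TF-IDF bag using the renamed extension.
// `mlContext` is an MLContext and `data` an IDataView with a "Message" text column (assumed).
var triCharTfIdf = mlContext.Transforms.Text
    .ProduceCharactersAsKeys("MessageChars", "Message")
    .Append(new NgramExtractingEstimator(mlContext, "BagOfTrichar", "MessageChars",
        ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf));

// Fit the pipeline and apply it to the same data.
var transformed = triCharTfIdf.Fit(data).Transform(data);
```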

@@ -26,7 +26,7 @@ public static void NgramTransform()
// A pipeline to tokenize text as characters and then combine them together into ngrams
// The pipeline uses the default settings to featurize.

- var charsPipeline = ml.Transforms.Text.ProduceCharacterTokens("Chars", "SentimentText", useMarkerCharacters: false);
+ var charsPipeline = ml.Transforms.Text.ProduceCharactersAsKeys("Chars", "SentimentText", useMarkerCharacters: false);
var ngramOnePipeline = ml.Transforms.Text.ProduceNgrams("CharsUnigrams", "Chars", ngramLength: 1);
var ngramTwpPipeline = ml.Transforms.Text.ProduceNgrams("CharsTwograms", "Chars");
var oneCharsPipeline = charsPipeline.Append(ngramOnePipeline);
src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs (2 changes: 1 addition & 1 deletion)
@@ -109,7 +109,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
- public static VarVector<Key<ushort, string>> ProduceCharacterTokens(this Scalar<string> input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters);
+ public static VarVector<Key<ushort, string>> ProduceCharactersAsKeys(this Scalar<string> input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters);
}

/// <summary>
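For the statically-typed API, a sketch of a call site after the rename might read as below. The reader columns `r.label` and `r.text` and the `data` object with `MakeNewEstimator()` are assumptions for illustration; the in-repo usage is visible in the StaticPipeTests change further down.

```csharp
// Hedged sketch: static pipeline column produced by the renamed extension.
// `r.text` is assumed to be a Scalar<string>; the output column is a
// VarVector<Key<ushort, string>> of per-character key IDs.
var est = data.MakeNewEstimator()
    .Append(r => (
        r.label,
        chars: r.text.ProduceCharactersAsKeys(useMarkerCharacters: false)));
```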
src/Microsoft.ML.Transforms/Text/TextCatalog.cs (4 changes: 2 additions & 2 deletions)
@@ -57,7 +57,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
/// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
- public static TokenizingByCharactersEstimator ProduceCharacterTokens(this TransformsCatalog.TextTransforms catalog,
+ public static TokenizingByCharactersEstimator ProduceCharactersAsKeys(this TransformsCatalog.TextTransforms catalog,
string outputColumnName,
string inputColumnName = null,
bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters)
@@ -72,7 +72,7 @@ public static TokenizingByCharactersEstimator ProduceCharacterTokens(this Transf
/// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
/// <param name="columns">Pairs of columns to run the tokenization on.</param>

- public static TokenizingByCharactersEstimator ProduceCharacterTokens(this TransformsCatalog.TextTransforms catalog,
+ public static TokenizingByCharactersEstimator ProduceCharactersAsKeys(this TransformsCatalog.TextTransforms catalog,
@artidoro (Contributor) commented on Mar 13, 2019

> ProduceCharactersAsKeys

See my comment on the TokenizeWords extension.
I would suggest TokenizeCharactersAsKeys as the new name. #Resolved

bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters,
params ColumnOptions[] columns)
=> new TokenizingByCharactersEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), useMarkerCharacters, ColumnOptions.ConvertToValueTuples(columns));
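As a usage sketch (not part of this diff), the single-column catalog overload above would be called as follows. The column names are illustrative assumptions; the marker characters 0x02 and 0x03 described in the XML docs are only emitted when useMarkerCharacters is left at its default of true.

```csharp
// Hedged sketch: calling the renamed catalog extension directly.
// Column names are assumed; useMarkerCharacters defaults to true.
var charKeyEstimator = mlContext.Transforms.Text.ProduceCharactersAsKeys(
    outputColumnName: "MessageChars",
    inputColumnName: "Message",
    useMarkerCharacters: false);
```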
test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs (2 changes: 1 addition & 1 deletion)
@@ -520,7 +520,7 @@ public void Tokenize()
.Append(r => (
r.label,
tokens: r.text.ProduceWordTokens(),
- chars: r.text.ProduceCharacterTokens()));
+ chars: r.text.ProduceCharactersAsKeys()));

var tdata = est.Fit(data).Transform(data);
var schema = tdata.AsDynamic.Schema;
@@ -467,7 +467,7 @@ private void TextFeaturizationOn(string dataPath)
BagOfBigrams: r.Message.NormalizeText().ProduceHashedWordBags(ngramLength: 2, useAllLengths: false),

// NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting.
- BagOfTrichar: r.Message.ProduceCharacterTokens().ProduceNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf),
+ BagOfTrichar: r.Message.ProduceCharactersAsKeys().ProduceNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf),

// NLP pipeline 4: word embeddings.
// PretrainedModelKind.Sswe is used here for performance of the test. In a real
@@ -305,7 +305,7 @@ private void TextFeaturizationOn(string dataPath)
ngramLength: 2, useAllLengths: false))

// NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting.
- .Append(mlContext.Transforms.Text.ProduceCharacterTokens("MessageChars", "Message"))
+ .Append(mlContext.Transforms.Text.ProduceCharactersAsKeys("MessageChars", "Message"))
.Append(new NgramExtractingEstimator(mlContext, "BagOfTrichar", "MessageChars",
ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf))
