Skip to content

Commit bad8298

Browse files
authored
Adding needed Tokenizer's APIs (#7047)
* Adding needed Tokenizer's APIs
* Address the feedback
* Small update to the newly exposed APIs
* fix comments
* Update the APIs signatures
* More feedback addressing
* Fix the comments
1 parent 8d31a8e commit bad8298

File tree

11 files changed

+541
-363
lines changed

11 files changed

+541
-363
lines changed

src/Microsoft.ML.Tokenizers/AddedToken.cs

-91
This file was deleted.

src/Microsoft.ML.Tokenizers/Model/BPE.cs

+7-7
Original file line numberDiff line numberDiff line change
@@ -176,8 +176,8 @@ private Bpe(Stream vocabStream, Stream? mergesStream, string? unknownToken, stri
176176
/// <summary>
177177
/// Encode a text string to a list of tokens.
178178
/// </summary>
179-
/// <param name="text">The text to encode.</param>
180-
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
179+
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
180+
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token. This parameter is ignored in this model.</param>
181181
/// <returns>The list of tokens generated from the text tokenization.</returns>
182182
public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken = false)
183183
{
@@ -192,17 +192,17 @@ public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken = f
192192
/// <summary>
193193
/// Encode a split text string to a list of Ids and add them to the accumulatedIds list.
194194
/// </summary>
195-
/// <param name="text">The text to split.</param>
196-
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
195+
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
196+
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token. This parameter is ignored in this model.</param>
197197
/// <param name="accumulatedIds">The list of accumulated encoded Ids.</param>
198198
public override void EncodeToIds(ReadOnlySpan<char> text, bool isSpecialToken, IList<int> accumulatedIds) => EncodeToIdsWithCache(text, accumulatedIds);
199199

200200
/// <summary>
201201
/// Get the number of tokens that the input text will be encoded to.
202202
/// </summary>
203-
/// <param name="text">The text to encode.</param>
204-
/// <param name="isSpecialToken">Indicate if the token is special token.</param>
205-
/// <returns>The number of tokens that the input text will be encoded to.</returns>
203+
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
204+
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token. This parameter is ignored in this model.</param>
205+
/// <returns>The number of tokens that the input text will be encoded to.</returns>
206206
public override int CountTokens(ReadOnlySpan<char> text, bool isSpecialToken) => EncodeToIdsWithCache(text, null);
207207

208208
/// <summary>

src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs

+6-6
Original file line numberDiff line numberDiff line change
@@ -176,8 +176,8 @@ public EnglishRoberta(Stream vocabularyStream, Stream mergeStream, Stream highes
176176
/// <summary>
177177
/// Encode a text string to a list of tokens.
178178
/// </summary>
179-
/// <param name="text">The text to encode.</param>
180-
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
179+
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
180+
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token. This parameter is ignored in this model.</param>
181181
/// <returns>The list of tokens generated from the text tokenization.</returns>
182182
public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken = false)
183183
{
@@ -224,16 +224,16 @@ public override IReadOnlyList<Token> Encode(string text, bool isSpecialToken = f
224224
/// <summary>
225225
/// Encode a split text string to a list of Ids and add them to the accumulatedIds list.
226226
/// </summary>
227-
/// <param name="text">The text to split.</param>
228-
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
227+
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
228+
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token. This parameter is ignored in this model.</param>
229229
/// <param name="accumulatedIds">The list of accumulated encoded Ids.</param>
230230
public override void EncodeToIds(ReadOnlySpan<char> text, bool isSpecialToken, IList<int> accumulatedIds) => EncodeToIds(text, accumulatedIds);
231231

232232
/// <summary>
233233
/// Get the number of tokens that the input text will be encoded to.
234234
/// </summary>
235-
/// <param name="text">The text to encode.</param>
236-
/// <param name="isSpecialToken">Indicate if the token is special token.</param>
235+
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
236+
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token. This parameter is ignored in this model.</param>
237237
/// <returns>The number of tokens that the input text will be encoded to.</returns>
238238
public override int CountTokens(ReadOnlySpan<char> text, bool isSpecialToken) => EncodeToIds(text, null);
239239

src/Microsoft.ML.Tokenizers/Model/Model.cs

+6-6
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,16 @@ public abstract class Model
1616
/// <summary>
1717
/// Encode a text to a list of tokens.
1818
/// </summary>
19-
/// <param name="text">The text to encode.</param>
20-
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
19+
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
20+
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token.</param>
2121
/// <returns>The list of tokens generated from the text tokenization.</returns>
2222
public abstract IReadOnlyList<Token> Encode(string text, bool isSpecialToken = false);
2323

2424
/// <summary>
2525
/// Encode a text to a list of Ids and add them to the accumulatedIds list.
2626
/// </summary>
27-
/// <param name="text">The text to encode.</param>
28-
/// <param name="isSpecialToken">Indicate if the token is a special token.</param>
27+
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
28+
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token.</param>
2929
/// <param name="accumulatedIds">The list of accumulated encoded Ids.</param>
3030
/// <remarks>
3131
/// This method provides the default implementation, which uses the Encode method to get the tokens' Ids.
@@ -49,8 +49,8 @@ public virtual void EncodeToIds(ReadOnlySpan<char> text, bool isSpecialToken, IL
4949
/// <summary>
5050
/// Get the number of tokens that the input text will be encoded to.
5151
/// </summary>
52-
/// <param name="text">The text to encode.</param>
53-
/// <param name="isSpecialToken">Indicate if the token is special token.</param>
52+
/// <param name="text">The text to encode. If the value of the parameter <paramref name="isSpecialToken"/> is true, the entire text will be treated as a special token.</param>
53+
/// <param name="isSpecialToken">Specifies whether the entire <paramref name="text"/> is considered a special token.</param>
5454
/// <returns>The number of tokens that the input text will be encoded to.</returns>
5555
/// <remarks>
5656
/// This method provides the default implementation, which uses the EncodeToIds method to get the number of token Ids.

0 commit comments

Comments
 (0)