
Commit d4f690c

tarekgh and Copilot authored
Support ByteLevel encoding in Bpe tokenizer to support DeepSeek model (#7425)
* Support ByteLevel encoding in Bpe tokenizer to support DeepSeek model

* Update src/Microsoft.ML.Tokenizers/Model/BpeOptions.cs

  Co-authored-by: Copilot <[email protected]>

* Feedback 1

---------

Co-authored-by: Copilot <[email protected]>
1 parent: adad40c · commit: d4f690c

File tree

8 files changed: +1332 -85 lines

eng/Versions.props

Lines changed: 1 addition & 1 deletion

@@ -100,7 +100,7 @@
     <MicrosoftMLTensorFlowTestModelsVersion>0.0.13-test</MicrosoftMLTensorFlowTestModelsVersion>
     <MicrosoftMLTestDatabasesVersion>0.0.6-test</MicrosoftMLTestDatabasesVersion>
     <MicrosoftMLTestModelsVersion>0.0.7-test</MicrosoftMLTestModelsVersion>
-    <MicrosoftMLTestTokenizersVersion>2.0.0-beta.25126.1</MicrosoftMLTestTokenizersVersion>
+    <MicrosoftMLTestTokenizersVersion>2.0.0-beta.25161.1</MicrosoftMLTestTokenizersVersion>
     <SystemDataSqlClientVersion>4.9.0</SystemDataSqlClientVersion>
     <SystemDataSQLiteCoreVersion>1.0.118</SystemDataSQLiteCoreVersion>
     <XunitCombinatorialVersion>1.6.24</XunitCombinatorialVersion>

src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs

Lines changed: 481 additions & 28 deletions
Large diffs are not rendered by default.
src/Microsoft.ML.Tokenizers/Model/BpeOptions.cs (new file)

Lines changed: 100 additions & 0 deletions

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// Options for the BPE tokenizer.
    /// </summary>
    public sealed class BpeOptions
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="BpeOptions"/> class.
        /// </summary>
        public BpeOptions(IEnumerable<(string Token, int Id)> vocabulary)
        {
            if (vocabulary == null)
            {
                throw new ArgumentNullException(nameof(vocabulary));
            }

            Vocabulary = vocabulary;
        }

        /// <summary>
        /// Gets the vocabulary to use.
        /// </summary>
        public IEnumerable<(string Token, int Id)> Vocabulary { get; }

        /// <summary>
        /// Gets or sets the list of the merge strings used to merge tokens during encoding.
        /// </summary>
        public IEnumerable<string>? Merges { get; set; }

        /// <summary>
        /// Gets or sets the optional special tokens to use.
        /// </summary>
        public Dictionary<string, int>? SpecialTokens { get; set; }

        /// <summary>
        /// Gets or sets the optional normalizer to normalize the input text before encoding it.
        /// </summary>
        public Normalizer? Normalizer { get; set; }

        /// <summary>
        /// Gets or sets the optional pre-tokenizer to split the input text into tokens before encoding it.
        /// </summary>
        public PreTokenizer? PreTokenizer { get; set; }

        /// <summary>
        /// Gets or sets the unknown token.
        /// </summary>
        public string? UnknownToken { get; set; }

        /// <summary>
        /// Gets or sets a value indicating whether to merge sequences of unknown tokens together.
        /// </summary>
        public bool FuseUnknownTokens { get; set; }

        /// <summary>
        /// Gets or sets the optional prefix to be used for every subword that is not a beginning-of-word token.
        /// </summary>
        public string? ContinuingSubwordPrefix { get; set; }

        /// <summary>
        /// Gets or sets the optional suffix used to characterize the end-of-word subword.
        /// </summary>
        public string? EndOfWordSuffix { get; set; }

        /// <summary>
        /// Gets or sets a value indicating whether to handle the input text at the byte level.
        /// If true, the input text will be converted to UTF-8 bytes before encoding it.
        /// Additionally, some ASCII characters will be transformed to different characters (e.g. the space character will be transformed to the 'Ġ' character).
        /// </summary>
        public bool ByteLevel { get; set; }

        /// <summary>
        /// Gets or sets the optional beginning-of-sentence token to be used when encoding the input text.
        /// </summary>
        /// <remarks>
        /// When specified, this token will be added to the beginning of the input text before encoding it.
        /// This is useful for models that require a specific token to indicate the start of a sentence.
        /// This token should be present in the vocabulary.
        /// </remarks>
        public string? BeginningOfSentenceToken { get; set; }

        /// <summary>
        /// Gets or sets the optional end-of-sentence token to be used when encoding the input text.
        /// </summary>
        /// <remarks>
        /// When specified, this token will be added to the end of the input text before encoding it.
        /// This is useful for models that require a specific token to indicate the end of a sentence.
        /// This token should be present in the vocabulary.
        /// </remarks>
        public string? EndOfSentenceToken { get; set; }
    }
}
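
For orientation, here is a minimal sketch of how these options might be populated for a byte-level model such as DeepSeek. The vocabulary, merges, and special-token names below are illustrative placeholders rather than values from this commit.

using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

internal static class BpeOptionsSketch
{
    // Illustrative only: the vocabulary, merges, and special-token names would
    // normally be loaded from the model's tokenizer files, not hard-coded here.
    public static BpeOptions Build(
        IEnumerable<(string Token, int Id)> vocabulary,
        IEnumerable<string> merges,
        Dictionary<string, int> specialTokens)
    {
        return new BpeOptions(vocabulary)
        {
            Merges = merges,
            SpecialTokens = specialTokens,
            ByteLevel = true,                  // encode UTF-8 bytes; e.g. space is represented as 'Ġ'
            BeginningOfSentenceToken = "<s>",  // placeholder; must be present in the vocabulary
            EndOfSentenceToken = "</s>",       // placeholder; must be present in the vocabulary
        };
    }
}

The BPETokenizer.cs changes above (whose large diff is not rendered here) presumably add an entry point that consumes a BpeOptions instance, something along the lines of BpeTokenizer.Create(options); since that diff is not shown, treat the exact method name as an assumption.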

src/Microsoft.ML.Tokenizers/Model/CodeGenTokenizer.cs

Lines changed: 4 additions & 25 deletions

@@ -1263,7 +1263,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
                 {
                     if (considerSpecialTokens)
                     {
-                        AppendToBytesArray(BeginningOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
+                        Helpers.AppendToBytesArray(BeginningOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
                     }
                     continue;
                 }

@@ -1272,7 +1272,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
                 {
                     if (considerSpecialTokens)
                     {
-                        AppendToBytesArray(EndOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
+                        Helpers.AppendToBytesArray(EndOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
                     }
                     continue;
                 }

@@ -1281,7 +1281,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
                 {
                     if (considerSpecialTokens)
                     {
-                        AppendToBytesArray(UnknownToken!.AsSpan(), ref bytes, ref bytesIndex);
+                        Helpers.AppendToBytesArray(UnknownToken!.AsSpan(), ref bytes, ref bytesIndex);
                     }
                     continue;
                 }

@@ -1306,7 +1306,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
                 {
                     ReadOnlySpan<char> span = firstToken && hasPrefixSpace && s.Length > 0 && s[0] == _transformedSpace ? s.AsSpan(1) : s.AsSpan();
                     firstToken = false;
-                    AppendToBytesArray(span, ref bytes, ref bytesIndex);
+                    Helpers.AppendToBytesArray(span, ref bytes, ref bytesIndex);
                 }
             }

@@ -1564,27 +1564,6 @@ public OperationStatus Decode(IEnumerable<int> ids, Span<char> destination, bool
             return null;
         }

-        private void AppendToBytesArray(ReadOnlySpan<char> text, ref byte[] bytes, ref int bytesIndex)
-        {
-            IReadOnlyDictionary<char, char> unicodeToByte = ByteToUnicodeEncoding.Instance.UnicodeToByte;
-            for (int i = 0; i < text.Length; i++)
-            {
-                if (unicodeToByte.TryGetValue(text[i], out char c))
-                {
-                    if (bytesIndex >= bytes.Length)
-                    {
-                        Helpers.ArrayPoolGrow<byte>(ref bytes, bytes.Length * 2);
-                    }
-
-                    bytes[bytesIndex++] = (byte)c;
-                    continue;
-                }
-
-                // rare cases
-                i += Helpers.EncodeCodePointToUtf8(text, i, ref bytes, ref bytesIndex) - 1;
-            }
-        }
-
        //
        // Private & Internal methods
        //
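
The method removed above is the byte-level decode step: each tokenizer-produced character is looked up in a unicode-to-byte table and written back as its original byte, with promotion to Helpers letting the Bpe tokenizer's new ByteLevel path share it. As a rough illustration of the kind of table it relies on (a conceptual GPT-2 style sketch, not the library's internal ByteToUnicodeEncoding implementation), printable bytes keep their own code point and everything else is shifted into the U+0100 range:

using System.Collections.Generic;

internal static class ByteLevelMapSketch
{
    // Conceptual byte -> unicode table: printable byte values map to themselves,
    // everything else (controls, space, 0x7F-0xA0, 0xAD) is shifted to 256, 257, ...
    // so every byte has a visible stand-in character.
    public static Dictionary<byte, char> BuildByteToUnicode()
    {
        var map = new Dictionary<byte, char>();
        int next = 256;
        for (int b = 0; b <= 255; b++)
        {
            bool keep = (b >= '!' && b <= '~') || (b >= 0xA1 && b <= 0xAC) || (b >= 0xAE && b <= 0xFF);
            map[(byte)b] = keep ? (char)b : (char)next++;
        }
        return map;
    }
}

Under this scheme byte 0x20 (space) lands at U+0120, i.e. 'Ġ', which matches the note in the BpeOptions.ByteLevel doc comment; the inverse table is what the relocated Helpers.AppendToBytesArray consults when turning decoded characters back into bytes.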

src/Microsoft.ML.Tokenizers/Model/Word.cs

Lines changed: 21 additions & 5 deletions

@@ -289,15 +289,31 @@ public override string ToString()
             return sb.ToString();
         }

-        public void ToTokens(SortedDictionary<int, string> vocabReverse, List<EncodedToken> tokens, int offset)
+        public void ToTokens(SortedDictionary<int, string> vocabReverse, List<EncodedToken> tokens, int offset, ReadOnlySpan<int> mapping)
         {
             int index = 0;

-            for (int i = 0; i < SymbolsCount; i++)
+            if (mapping.IsEmpty)
+            {
+                for (int i = 0; i < SymbolsCount; i++)
+                {
+                    int endIndex = index + _symbols[i].Len;
+                    tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(index + offset, index + offset + _symbols[i].Len)));
+                    index += _symbols[i].Len;
+                }
+            }
+            else
             {
-                int endIndex = index + _symbols[i].Len;
-                tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(index + offset, index + offset + _symbols[i].Len)));
-                index += _symbols[i].Len;
+                for (int i = 0; i < SymbolsCount; i++)
+                {
+                    int endIndex = index + _symbols[i].Len;
+
+                    int mappedIndex = mapping[index];
+                    int mappedEndIndex = mapping[endIndex - 1] + 1;
+
+                    tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(mappedIndex + offset, mappedEndIndex + offset)));
+                    index += _symbols[i].Len;
+                }
             }
         }
     }
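
The new mapping parameter appears to translate offsets in the byte-level text back to positions in the original input: when ByteLevel expansion turns one character into several mapped characters, mapping[k] records which original position the k-th byte-level position came from, so the EncodedToken ranges still point into the caller's string. A small worked example of that index translation (the sample text, the mapping array, and the class below are illustrative, not part of the commit):

using System;

internal static class OffsetMappingSketch
{
    public static void Main()
    {
        // "é!" is three UTF-8 bytes (0xC3 0xA9 0x21), so its byte-level form has three
        // characters; positions 0 and 1 both originate from the 'é' at original index 0.
        ReadOnlySpan<int> mapping = new[] { 0, 0, 1 };

        // A token covering byte-level positions [0, 2) — the two bytes of 'é' — maps back
        // to [0, 1) in the original string, mirroring the mappedIndex / mappedEndIndex
        // computation in ToTokens above.
        int index = 0, len = 2;
        int mappedIndex = mapping[index];
        int mappedEndIndex = mapping[index + len - 1] + 1;

        Console.WriteLine($"byte-level [{index}, {index + len}) -> original [{mappedIndex}, {mappedEndIndex})");
        // Output: byte-level [0, 2) -> original [0, 1)
    }
}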
