Skip to content

Support ByteLevel encoding in Bpe tokenizer to support DeepSeek model #7425

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion eng/Versions.props
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@
<MicrosoftMLTensorFlowTestModelsVersion>0.0.13-test</MicrosoftMLTensorFlowTestModelsVersion>
<MicrosoftMLTestDatabasesVersion>0.0.6-test</MicrosoftMLTestDatabasesVersion>
<MicrosoftMLTestModelsVersion>0.0.7-test</MicrosoftMLTestModelsVersion>
<MicrosoftMLTestTokenizersVersion>2.0.0-beta.25126.1</MicrosoftMLTestTokenizersVersion>
<MicrosoftMLTestTokenizersVersion>2.0.0-beta.25161.1</MicrosoftMLTestTokenizersVersion>
<SystemDataSqlClientVersion>4.9.0</SystemDataSqlClientVersion>
<SystemDataSQLiteCoreVersion>1.0.118</SystemDataSQLiteCoreVersion>
<XunitCombinatorialVersion>1.6.24</XunitCombinatorialVersion>
Expand Down
509 changes: 481 additions & 28 deletions src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs

Large diffs are not rendered by default.

100 changes: 100 additions & 0 deletions src/Microsoft.ML.Tokenizers/Model/BpeOptions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// Options for the BPE tokenizer.
    /// </summary>
    public sealed class BpeOptions
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="BpeOptions"/> class.
        /// </summary>
        /// <param name="vocabulary">The vocabulary to use, expressed as (token, id) pairs.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="vocabulary"/> is <see langword="null"/>.</exception>
        public BpeOptions(IEnumerable<(string Token, int Id)> vocabulary)
        {
            if (vocabulary == null)
            {
                throw new ArgumentNullException(nameof(vocabulary));
            }

            Vocabulary = vocabulary;
        }

        /// <summary>
        /// Gets the vocabulary to use.
        /// </summary>
        public IEnumerable<(string Token, int Id)> Vocabulary { get; }

        /// <summary>
        /// Gets or sets the list of the merge strings used to merge tokens during encoding.
        /// </summary>
        public IEnumerable<string>? Merges { get; set; }

        /// <summary>
        /// Gets or sets the optional special tokens to use.
        /// </summary>
        public Dictionary<string, int>? SpecialTokens { get; set; }

        /// <summary>
        /// Gets or sets the optional normalizer to normalize the input text before encoding it.
        /// </summary>
        public Normalizer? Normalizer { get; set; }

        /// <summary>
        /// Gets or sets the optional pre-tokenizer to split the input text into tokens before encoding it.
        /// </summary>
        public PreTokenizer? PreTokenizer { get; set; }

        /// <summary>
        /// Gets or sets the Unknown token.
        /// </summary>
        public string? UnknownToken { get; set; }

        /// <summary>
        /// Gets or sets a value indicating whether to merge the sequence of the unknown tokens together.
        /// </summary>
        public bool FuseUnknownTokens { get; set; }

        /// <summary>
        /// Gets or sets the optional prefix to be used for every subword that is not a beginning-of-word token.
        /// </summary>
        public string? ContinuingSubwordPrefix { get; set; }

        /// <summary>
        /// Gets or sets the optional suffix to characterize the end-of-word and sub-word.
        /// </summary>
        public string? EndOfWordSuffix { get; set; }

        /// <summary>
        /// Gets or sets a value indicating whether to handle the input text at the byte level.
        /// If true, the input text will be converted to UTF-8 bytes before encoding it.
        /// Additionally, some ASCII characters will be transformed to different characters
        /// (e.g., the space character will be transformed to the 'Ġ' character).
        /// </summary>
        public bool ByteLevel { get; set; }

        /// <summary>
        /// Gets or sets the optional beginning of sentence token to be used when encoding the input text.
        /// </summary>
        /// <remarks>
        /// When specified, this token will be added to the beginning of the input text before encoding it.
        /// This is useful for models that require a specific token to indicate the start of a sentence.
        /// This token should be present in the vocabulary.
        /// </remarks>
        public string? BeginningOfSentenceToken { get; set; }

        /// <summary>
        /// Gets or sets the optional end of sentence token to be used when encoding the input text.
        /// </summary>
        /// <remarks>
        /// When specified, this token will be added to the end of the input text before encoding it.
        /// This is useful for models that require a specific token to indicate the end of a sentence.
        /// This token should be present in the vocabulary.
        /// </remarks>
        public string? EndOfSentenceToken { get; set; }
    }
}
29 changes: 4 additions & 25 deletions src/Microsoft.ML.Tokenizers/Model/CodeGenTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1263,7 +1263,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
{
if (considerSpecialTokens)
{
AppendToBytesArray(BeginningOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
Helpers.AppendToBytesArray(BeginningOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
}
continue;
}
Expand All @@ -1272,7 +1272,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
{
if (considerSpecialTokens)
{
AppendToBytesArray(EndOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
Helpers.AppendToBytesArray(EndOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
}
continue;
}
Expand All @@ -1281,7 +1281,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
{
if (considerSpecialTokens)
{
AppendToBytesArray(UnknownToken!.AsSpan(), ref bytes, ref bytesIndex);
Helpers.AppendToBytesArray(UnknownToken!.AsSpan(), ref bytes, ref bytesIndex);
}
continue;
}
Expand All @@ -1306,7 +1306,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
{
ReadOnlySpan<char> span = firstToken && hasPrefixSpace && s.Length > 0 && s[0] == _transformedSpace ? s.AsSpan(1) : s.AsSpan();
firstToken = false;
AppendToBytesArray(span, ref bytes, ref bytesIndex);
Helpers.AppendToBytesArray(span, ref bytes, ref bytesIndex);
}
}

Expand Down Expand Up @@ -1564,27 +1564,6 @@ public OperationStatus Decode(IEnumerable<int> ids, Span<char> destination, bool
return null;
}

/// <summary>
/// Converts <paramref name="text"/> back into its underlying byte representation and appends
/// the bytes to <paramref name="bytes"/>, growing the buffer when it is full.
/// </summary>
/// <param name="text">The token text to convert; characters are assumed to come from the byte-level encoding.</param>
/// <param name="bytes">The destination buffer; may be replaced with a larger buffer when capacity is exceeded.</param>
/// <param name="bytesIndex">The current write position in <paramref name="bytes"/>; advanced past the appended bytes.</param>
private void AppendToBytesArray(ReadOnlySpan<char> text, ref byte[] bytes, ref int bytesIndex)
{
    // Reverse map of the byte-level encoding: printable unicode char -> original byte value.
    IReadOnlyDictionary<char, char> unicodeToByte = ByteToUnicodeEncoding.Instance.UnicodeToByte;
    for (int i = 0; i < text.Length; i++)
    {
        if (unicodeToByte.TryGetValue(text[i], out char c))
        {
            if (bytesIndex >= bytes.Length)
            {
                // Out of capacity: double the buffer (Helpers.ArrayPoolGrow presumably rents a
                // larger pooled array — confirm against Helpers implementation).
                Helpers.ArrayPoolGrow<byte>(ref bytes, bytes.Length * 2);
            }

            bytes[bytesIndex++] = (byte)c;
            continue;
        }

        // rare cases: character not in the reverse map; emit the code point as UTF-8.
        // The helper returns the number of chars consumed; -1 compensates for the loop's i++.
        i += Helpers.EncodeCodePointToUtf8(text, i, ref bytes, ref bytesIndex) - 1;
    }
}

//
// Private & Internal methods
//
Expand Down
26 changes: 21 additions & 5 deletions src/Microsoft.ML.Tokenizers/Model/Word.cs
Original file line number Diff line number Diff line change
Expand Up @@ -289,15 +289,31 @@ public override string ToString()
return sb.ToString();
}

/// <summary>
/// Materializes this word's symbols as <see cref="EncodedToken"/> entries appended to <paramref name="tokens"/>.
/// </summary>
/// <param name="vocabReverse">Maps a token id back to its string representation.</param>
/// <param name="tokens">Destination list that encoded tokens are appended to.</param>
/// <param name="offset">Offset added to every reported character position.</param>
/// <param name="mapping">
/// Optional map from encoded (e.g. byte-level transformed) character indices back to indices in the
/// original text; pass an empty span when positions need no remapping.
/// </param>
public void ToTokens(SortedDictionary<int, string> vocabReverse, List<EncodedToken> tokens, int offset, ReadOnlySpan<int> mapping)
{
    int index = 0;

    if (mapping.IsEmpty)
    {
        for (int i = 0; i < SymbolsCount; i++)
        {
            tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(index + offset, index + offset + _symbols[i].Len)));
            index += _symbols[i].Len;
        }
    }
    else
    {
        for (int i = 0; i < SymbolsCount; i++)
        {
            int endIndex = index + _symbols[i].Len;

            // Translate encoded positions back to positions in the original text. The end is
            // mapped from the last covered index, then extended by one to stay exclusive.
            int mappedIndex = mapping[index];
            int mappedEndIndex = mapping[endIndex - 1] + 1;

            tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(mappedIndex + offset, mappedEndIndex + offset)));
            index += _symbols[i].Len;
        }
    }
}
}
Expand Down
Loading