Skip to content

Support ByteLevel encoding in Bpe tokenizer to support DeepSeek model #7425

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion eng/Versions.props
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@
<MicrosoftMLTensorFlowTestModelsVersion>0.0.13-test</MicrosoftMLTensorFlowTestModelsVersion>
<MicrosoftMLTestDatabasesVersion>0.0.6-test</MicrosoftMLTestDatabasesVersion>
<MicrosoftMLTestModelsVersion>0.0.7-test</MicrosoftMLTestModelsVersion>
<MicrosoftMLTestTokenizersVersion>2.0.0-beta.25126.1</MicrosoftMLTestTokenizersVersion>
<MicrosoftMLTestTokenizersVersion>2.0.0-beta.25161.1</MicrosoftMLTestTokenizersVersion>
<SystemDataSqlClientVersion>4.9.0</SystemDataSqlClientVersion>
<SystemDataSQLiteCoreVersion>1.0.118</SystemDataSQLiteCoreVersion>
<XunitCombinatorialVersion>1.6.24</XunitCombinatorialVersion>
Expand Down
509 changes: 481 additions & 28 deletions src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs

Large diffs are not rendered by default.

100 changes: 100 additions & 0 deletions src/Microsoft.ML.Tokenizers/Model/BpeOptions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// Options for the BPE tokenizer.
    /// </summary>
    public sealed class BpeOptions
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="BpeOptions"/> class.
        /// </summary>
        /// <param name="vocabulary">The vocabulary to use, expressed as (token, id) pairs.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="vocabulary"/> is <see langword="null"/>.</exception>
        public BpeOptions(IEnumerable<(string Token, int Id)> vocabulary)
        {
            if (vocabulary == null)
            {
                throw new ArgumentNullException(nameof(vocabulary));
            }

            Vocabulary = vocabulary;
        }

        /// <summary>
        /// Gets the vocabulary to use.
        /// </summary>
        public IEnumerable<(string Token, int Id)> Vocabulary { get; }

        /// <summary>
        /// Gets or sets the list of the merge strings used to merge tokens during encoding.
        /// </summary>
        public IEnumerable<string>? Merges { get; set; }

        /// <summary>
        /// Gets or sets the optional special tokens to use.
        /// </summary>
        public Dictionary<string, int>? SpecialTokens { get; set; }

        /// <summary>
        /// Gets or sets the optional normalizer to normalize the input text before encoding it.
        /// </summary>
        public Normalizer? Normalizer { get; set; }

        /// <summary>
        /// Gets or sets the optional pre-tokenizer to split the input text into tokens before encoding it.
        /// </summary>
        public PreTokenizer? PreTokenizer { get; set; }

        /// <summary>
        /// Gets or sets the Unknown token.
        /// </summary>
        public string? UnknownToken { get; set; }

        /// <summary>
        /// Gets or sets a value indicating whether to merge the sequence of the unknown tokens together.
        /// </summary>
        public bool FuseUnknownTokens { get; set; }

        /// <summary>
        /// Gets or sets the optional prefix to be used for every subword that is not a beginning-of-word token.
        /// </summary>
        public string? ContinuingSubwordPrefix { get; set; }

        /// <summary>
        /// Gets or sets the optional suffix to characterize the end-of-word and sub-word.
        /// </summary>
        public string? EndOfWordSuffix { get; set; }

        /// <summary>
        /// Gets or sets a value indicating whether to handle the input text at the byte level.
        /// If true, the input text will be converted to UTF-8 bytes before encoding it.
        /// Additionally, some ASCII characters will be transformed to different characters
        /// (e.g., the space character will be transformed to the 'Ġ' character).
        /// </summary>
        public bool ByteLevel { get; set; }

        /// <summary>
        /// Gets or sets the optional beginning of sentence token to be used when encoding the input text.
        /// </summary>
        /// <remarks>
        /// When specified, this token will be added to the beginning of the input text before encoding it.
        /// This is useful for models that require a specific token to indicate the start of a sentence.
        /// This token should be present in the vocabulary.
        /// </remarks>
        public string? BeginningOfSentenceToken { get; set; }

        /// <summary>
        /// Gets or sets the optional end of sentence token to be used when encoding the input text.
        /// </summary>
        /// <remarks>
        /// When specified, this token will be added to the end of the input text before encoding it.
        /// This is useful for models that require a specific token to indicate the end of a sentence.
        /// This token should be present in the vocabulary.
        /// </remarks>
        public string? EndOfSentenceToken { get; set; }
    }
}
29 changes: 4 additions & 25 deletions src/Microsoft.ML.Tokenizers/Model/CodeGenTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1263,7 +1263,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
{
if (considerSpecialTokens)
{
AppendToBytesArray(BeginningOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
Helpers.AppendToBytesArray(BeginningOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
}
continue;
}
Expand All @@ -1272,7 +1272,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
{
if (considerSpecialTokens)
{
AppendToBytesArray(EndOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
Helpers.AppendToBytesArray(EndOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
}
continue;
}
Expand All @@ -1281,7 +1281,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
{
if (considerSpecialTokens)
{
AppendToBytesArray(UnknownToken!.AsSpan(), ref bytes, ref bytesIndex);
Helpers.AppendToBytesArray(UnknownToken!.AsSpan(), ref bytes, ref bytesIndex);
}
continue;
}
Expand All @@ -1306,7 +1306,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
{
ReadOnlySpan<char> span = firstToken && hasPrefixSpace && s.Length > 0 && s[0] == _transformedSpace ? s.AsSpan(1) : s.AsSpan();
firstToken = false;
AppendToBytesArray(span, ref bytes, ref bytesIndex);
Helpers.AppendToBytesArray(span, ref bytes, ref bytesIndex);
}
}

Expand Down Expand Up @@ -1564,27 +1564,6 @@ public OperationStatus Decode(IEnumerable<int> ids, Span<char> destination, bool
return null;
}

/// <summary>
/// Converts <paramref name="text"/> back into its underlying byte representation and appends
/// the bytes to <paramref name="bytes"/>, growing the buffer when it is full.
/// </summary>
/// <param name="text">The token text to convert; characters are assumed to come from the byte-level encoding.</param>
/// <param name="bytes">The destination buffer; may be replaced with a larger buffer when capacity is exceeded.</param>
/// <param name="bytesIndex">The current write position in <paramref name="bytes"/>; advanced past the appended bytes.</param>
private void AppendToBytesArray(ReadOnlySpan<char> text, ref byte[] bytes, ref int bytesIndex)
{
    // Reverse map of the byte-level encoding: printable unicode char -> original byte value.
    IReadOnlyDictionary<char, char> unicodeToByte = ByteToUnicodeEncoding.Instance.UnicodeToByte;
    for (int i = 0; i < text.Length; i++)
    {
        if (unicodeToByte.TryGetValue(text[i], out char c))
        {
            if (bytesIndex >= bytes.Length)
            {
                // Out of capacity: double the buffer (Helpers.ArrayPoolGrow presumably rents a
                // larger pooled array — confirm against Helpers implementation).
                Helpers.ArrayPoolGrow<byte>(ref bytes, bytes.Length * 2);
            }

            bytes[bytesIndex++] = (byte)c;
            continue;
        }

        // rare cases: character not in the reverse map; emit the code point as UTF-8.
        // The helper returns the number of chars consumed; -1 compensates for the loop's i++.
        i += Helpers.EncodeCodePointToUtf8(text, i, ref bytes, ref bytesIndex) - 1;
    }
}

//
// Private & Internal methods
//
Expand Down
26 changes: 21 additions & 5 deletions src/Microsoft.ML.Tokenizers/Model/Word.cs
Original file line number Diff line number Diff line change
Expand Up @@ -289,15 +289,31 @@ public override string ToString()
return sb.ToString();
}

/// <summary>
/// Materializes this word's symbols as <see cref="EncodedToken"/> entries appended to <paramref name="tokens"/>.
/// </summary>
/// <param name="vocabReverse">Maps a token id back to its string representation.</param>
/// <param name="tokens">Destination list that encoded tokens are appended to.</param>
/// <param name="offset">Offset added to every reported character position.</param>
/// <param name="mapping">
/// Optional map from encoded (e.g. byte-level transformed) character indices back to indices in the
/// original text; pass an empty span when positions need no remapping.
/// </param>
public void ToTokens(SortedDictionary<int, string> vocabReverse, List<EncodedToken> tokens, int offset, ReadOnlySpan<int> mapping)
{
    int index = 0;

    if (mapping.IsEmpty)
    {
        for (int i = 0; i < SymbolsCount; i++)
        {
            tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(index + offset, index + offset + _symbols[i].Len)));
            index += _symbols[i].Len;
        }
    }
    else
    {
        for (int i = 0; i < SymbolsCount; i++)
        {
            int endIndex = index + _symbols[i].Len;

            // Translate encoded positions back to positions in the original text. The end is
            // mapped from the last covered index, then extended by one to stay exclusive.
            int mappedIndex = mapping[index];
            int mappedEndIndex = mapping[endIndex - 1] + 1;

            tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(mappedIndex + offset, mappedEndIndex + offset)));
            index += _symbols[i].Len;
        }
    }
}
}
Expand Down
Loading