
Commit d4f690c

tarekgh and Copilot authored
Support ByteLevel encoding in Bpe tokenizer to support DeepSeek model (#7425)
* Support ByteLevel encoding in Bpe tokenizer to support DeepSeek model

* Update src/Microsoft.ML.Tokenizers/Model/BpeOptions.cs

  Co-authored-by: Copilot <[email protected]>

* Feedback 1

---------

Co-authored-by: Copilot <[email protected]>
1 parent: adad40c · commit: d4f690c

File tree

8 files changed: +1332 -85 lines

eng/Versions.props

Lines changed: 1 addition & 1 deletion

@@ -100,7 +100,7 @@
     <MicrosoftMLTensorFlowTestModelsVersion>0.0.13-test</MicrosoftMLTensorFlowTestModelsVersion>
     <MicrosoftMLTestDatabasesVersion>0.0.6-test</MicrosoftMLTestDatabasesVersion>
     <MicrosoftMLTestModelsVersion>0.0.7-test</MicrosoftMLTestModelsVersion>
-    <MicrosoftMLTestTokenizersVersion>2.0.0-beta.25126.1</MicrosoftMLTestTokenizersVersion>
+    <MicrosoftMLTestTokenizersVersion>2.0.0-beta.25161.1</MicrosoftMLTestTokenizersVersion>
     <SystemDataSqlClientVersion>4.9.0</SystemDataSqlClientVersion>
     <SystemDataSQLiteCoreVersion>1.0.118</SystemDataSQLiteCoreVersion>
     <XunitCombinatorialVersion>1.6.24</XunitCombinatorialVersion>

src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs

Lines changed: 481 additions & 28 deletions
Large diffs are not rendered by default.
src/Microsoft.ML.Tokenizers/Model/BpeOptions.cs (new file)

Lines changed: 100 additions & 0 deletions

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// Options for the BPE tokenizer.
    /// </summary>
    public sealed class BpeOptions
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="BpeOptions"/> class.
        /// </summary>
        public BpeOptions(IEnumerable<(string Token, int Id)> vocabulary)
        {
            if (vocabulary == null)
            {
                throw new ArgumentNullException(nameof(vocabulary));
            }

            Vocabulary = vocabulary;
        }

        /// <summary>
        /// Gets the vocabulary to use.
        /// </summary>
        public IEnumerable<(string Token, int Id)> Vocabulary { get; }

        /// <summary>
        /// Gets or sets the list of the merge strings used to merge tokens during encoding.
        /// </summary>
        public IEnumerable<string>? Merges { get; set; }

        /// <summary>
        /// Gets or sets the optional special tokens to use.
        /// </summary>
        public Dictionary<string, int>? SpecialTokens { get; set; }

        /// <summary>
        /// Gets or sets the optional normalizer to normalize the input text before encoding it.
        /// </summary>
        public Normalizer? Normalizer { get; set; }

        /// <summary>
        /// Gets or sets the optional pre-tokenizer to split the input text into tokens before encoding it.
        /// </summary>
        public PreTokenizer? PreTokenizer { get; set; }

        /// <summary>
        /// Gets or sets the unknown token.
        /// </summary>
        public string? UnknownToken { get; set; }

        /// <summary>
        /// Gets or sets a value indicating whether to merge sequences of unknown tokens together.
        /// </summary>
        public bool FuseUnknownTokens { get; set; }

        /// <summary>
        /// Gets or sets the optional prefix to be used for every subword that is not a beginning-of-word token.
        /// </summary>
        public string? ContinuingSubwordPrefix { get; set; }

        /// <summary>
        /// Gets or sets the optional suffix used to characterize the end-of-word subword.
        /// </summary>
        public string? EndOfWordSuffix { get; set; }

        /// <summary>
        /// Gets or sets a value indicating whether to handle the input text at the byte level.
        /// If true, the input text will be converted to UTF-8 bytes before encoding it.
        /// Additionally, some ASCII characters will be transformed to different characters (e.g. the space character will be transformed to the 'Ġ' character).
        /// </summary>
        public bool ByteLevel { get; set; }

        /// <summary>
        /// Gets or sets the optional beginning-of-sentence token to be used when encoding the input text.
        /// </summary>
        /// <remarks>
        /// When specified, this token will be added to the beginning of the input text before encoding it.
        /// This is useful for models that require a specific token to indicate the start of a sentence.
        /// This token should be present in the vocabulary.
        /// </remarks>
        public string? BeginningOfSentenceToken { get; set; }

        /// <summary>
        /// Gets or sets the optional end-of-sentence token to be used when encoding the input text.
        /// </summary>
        /// <remarks>
        /// When specified, this token will be added to the end of the input text before encoding it.
        /// This is useful for models that require a specific token to indicate the end of a sentence.
        /// This token should be present in the vocabulary.
        /// </remarks>
        public string? EndOfSentenceToken { get; set; }
    }
}
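
For orientation, here is a minimal sketch of how these options might be populated for a byte-level model such as DeepSeek. The vocabulary, merges, and special-token names below are illustrative placeholders rather than values from this commit.

using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

internal static class BpeOptionsSketch
{
    // Illustrative only: the vocabulary, merges, and special-token names would
    // normally be loaded from the model's tokenizer files, not hard-coded here.
    public static BpeOptions Build(
        IEnumerable<(string Token, int Id)> vocabulary,
        IEnumerable<string> merges,
        Dictionary<string, int> specialTokens)
    {
        return new BpeOptions(vocabulary)
        {
            Merges = merges,
            SpecialTokens = specialTokens,
            ByteLevel = true,                  // encode UTF-8 bytes; e.g. space is represented as 'Ġ'
            BeginningOfSentenceToken = "<s>",  // placeholder; must be present in the vocabulary
            EndOfSentenceToken = "</s>",       // placeholder; must be present in the vocabulary
        };
    }
}

The BPETokenizer.cs changes above (whose large diff is not rendered here) presumably add an entry point that consumes a BpeOptions instance, something along the lines of BpeTokenizer.Create(options); since that diff is not shown, treat the exact method name as an assumption.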

src/Microsoft.ML.Tokenizers/Model/CodeGenTokenizer.cs

Lines changed: 4 additions & 25 deletions

@@ -1263,7 +1263,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
                 {
                     if (considerSpecialTokens)
                     {
-                        AppendToBytesArray(BeginningOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
+                        Helpers.AppendToBytesArray(BeginningOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
                     }
                     continue;
                 }

@@ -1272,7 +1272,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
                 {
                     if (considerSpecialTokens)
                     {
-                        AppendToBytesArray(EndOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
+                        Helpers.AppendToBytesArray(EndOfSentenceToken!.AsSpan(), ref bytes, ref bytesIndex);
                     }
                     continue;
                 }

@@ -1281,7 +1281,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
                 {
                     if (considerSpecialTokens)
                     {
-                        AppendToBytesArray(UnknownToken!.AsSpan(), ref bytes, ref bytesIndex);
+                        Helpers.AppendToBytesArray(UnknownToken!.AsSpan(), ref bytes, ref bytesIndex);
                     }
                     continue;
                 }

@@ -1306,7 +1306,7 @@ public string Decode(IEnumerable<int> ids, bool hasPrefixSpace, bool considerSpe
                 {
                     ReadOnlySpan<char> span = firstToken && hasPrefixSpace && s.Length > 0 && s[0] == _transformedSpace ? s.AsSpan(1) : s.AsSpan();
                     firstToken = false;
-                    AppendToBytesArray(span, ref bytes, ref bytesIndex);
+                    Helpers.AppendToBytesArray(span, ref bytes, ref bytesIndex);
                 }
             }

@@ -1564,27 +1564,6 @@ public OperationStatus Decode(IEnumerable<int> ids, Span<char> destination, bool
             return null;
         }

-        private void AppendToBytesArray(ReadOnlySpan<char> text, ref byte[] bytes, ref int bytesIndex)
-        {
-            IReadOnlyDictionary<char, char> unicodeToByte = ByteToUnicodeEncoding.Instance.UnicodeToByte;
-            for (int i = 0; i < text.Length; i++)
-            {
-                if (unicodeToByte.TryGetValue(text[i], out char c))
-                {
-                    if (bytesIndex >= bytes.Length)
-                    {
-                        Helpers.ArrayPoolGrow<byte>(ref bytes, bytes.Length * 2);
-                    }
-
-                    bytes[bytesIndex++] = (byte)c;
-                    continue;
-                }
-
-                // rare cases
-                i += Helpers.EncodeCodePointToUtf8(text, i, ref bytes, ref bytesIndex) - 1;
-            }
-        }
-
        //
        // Private & Internal methods
        //
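
The method removed above is the byte-level decode step: each tokenizer-produced character is looked up in a unicode-to-byte table and written back as its original byte, with promotion to Helpers letting the Bpe tokenizer's new ByteLevel path share it. As a rough illustration of the kind of table it relies on (a conceptual GPT-2 style sketch, not the library's internal ByteToUnicodeEncoding implementation), printable bytes keep their own code point and everything else is shifted into the U+0100 range:

using System.Collections.Generic;

internal static class ByteLevelMapSketch
{
    // Conceptual byte -> unicode table: printable byte values map to themselves,
    // everything else (controls, space, 0x7F-0xA0, 0xAD) is shifted to 256, 257, ...
    // so every byte has a visible stand-in character.
    public static Dictionary<byte, char> BuildByteToUnicode()
    {
        var map = new Dictionary<byte, char>();
        int next = 256;
        for (int b = 0; b <= 255; b++)
        {
            bool keep = (b >= '!' && b <= '~') || (b >= 0xA1 && b <= 0xAC) || (b >= 0xAE && b <= 0xFF);
            map[(byte)b] = keep ? (char)b : (char)next++;
        }
        return map;
    }
}

Under this scheme byte 0x20 (space) lands at U+0120, i.e. 'Ġ', which matches the note in the BpeOptions.ByteLevel doc comment; the inverse table is what the relocated Helpers.AppendToBytesArray consults when turning decoded characters back into bytes.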

src/Microsoft.ML.Tokenizers/Model/Word.cs

Lines changed: 21 additions & 5 deletions

@@ -289,15 +289,31 @@ public override string ToString()
             return sb.ToString();
         }

-        public void ToTokens(SortedDictionary<int, string> vocabReverse, List<EncodedToken> tokens, int offset)
+        public void ToTokens(SortedDictionary<int, string> vocabReverse, List<EncodedToken> tokens, int offset, ReadOnlySpan<int> mapping)
         {
             int index = 0;

-            for (int i = 0; i < SymbolsCount; i++)
+            if (mapping.IsEmpty)
+            {
+                for (int i = 0; i < SymbolsCount; i++)
+                {
+                    int endIndex = index + _symbols[i].Len;
+                    tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(index + offset, index + offset + _symbols[i].Len)));
+                    index += _symbols[i].Len;
+                }
+            }
+            else
             {
-                int endIndex = index + _symbols[i].Len;
-                tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(index + offset, index + offset + _symbols[i].Len)));
-                index += _symbols[i].Len;
+                for (int i = 0; i < SymbolsCount; i++)
+                {
+                    int endIndex = index + _symbols[i].Len;
+
+                    int mappedIndex = mapping[index];
+                    int mappedEndIndex = mapping[endIndex - 1] + 1;
+
+                    tokens.Add(new EncodedToken(_symbols[i].C, vocabReverse[_symbols[i].C], new Range(mappedIndex + offset, mappedEndIndex + offset)));
+                    index += _symbols[i].Len;
+                }
             }
         }
     }
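
The new mapping parameter appears to translate offsets in the byte-level text back to positions in the original input: when ByteLevel expansion turns one character into several mapped characters, mapping[k] records which original position the k-th byte-level position came from, so the EncodedToken ranges still point into the caller's string. A small worked example of that index translation (the sample text, the mapping array, and the class below are illustrative, not part of the commit):

using System;

internal static class OffsetMappingSketch
{
    public static void Main()
    {
        // "é!" is three UTF-8 bytes (0xC3 0xA9 0x21), so its byte-level form has three
        // characters; positions 0 and 1 both originate from the 'é' at original index 0.
        ReadOnlySpan<int> mapping = new[] { 0, 0, 1 };

        // A token covering byte-level positions [0, 2) — the two bytes of 'é' — maps back
        // to [0, 1) in the original string, mirroring the mappedIndex / mappedEndIndex
        // computation in ToTokens above.
        int index = 0, len = 2;
        int mappedIndex = mapping[index];
        int mappedEndIndex = mapping[index + len - 1] + 1;

        Console.WriteLine($"byte-level [{index}, {index + len}) -> original [{mappedIndex}, {mappedEndIndex})");
        // Output: byte-level [0, 2) -> original [0, 1)
    }
}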
