
[release/4.0] Moved SpecialTokens assignment after the modification to avoid "Collection Modified" error #7330


Merged (8 commits) on Dec 9, 2024
Changes from 1 commit
Fix copilot changes
tarekgh committed Dec 5, 2024
commit 1362232e150afefa91e61aeaf85620a02b5261b7
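
The fix pattern behind the PR title, sketched below: Dictionary<TKey, TValue> throws InvalidOperationException ("Collection was modified; enumeration operation may not execute") when entries are added while the dictionary is being enumerated, so all modifications should happen on a temporary dictionary that is assigned to the publicly exposed property only afterwards. This snippet is a minimal standalone illustration, not the PR's code; every name in it is hypothetical.

using System;
using System.Collections.Generic;

class CollectionModifiedSketch
{
    static void Main()
    {
        var specialTokens = new Dictionary<string, int>
        {
            ["[UNK]"] = 1,
            ["[CLS]"] = 2
        };

        try
        {
            // Buggy pattern: mutating the same dictionary that is being enumerated.
            foreach (KeyValuePair<string, int> kvp in specialTokens)
            {
                // Writes a lower-cased key, i.e. adds a new entry mid-enumeration.
                specialTokens[kvp.Key.ToLowerInvariant()] = kvp.Value;
            }
        }
        catch (InvalidOperationException ex)
        {
            Console.WriteLine(ex.Message); // "Collection was modified; ..."
        }

        // Fixed pattern: modify a temporary dictionary, then assign it.
        var temp = new Dictionary<string, int>();
        foreach (KeyValuePair<string, int> kvp in specialTokens)
        {
            temp[kvp.Key.ToLowerInvariant()] = kvp.Value;
        }
        specialTokens = temp; // assignment happens after all modification is done
    }
}
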
src/Microsoft.ML.Tokenizers/Model/BertTokenizer.cs (10 changes: 5 additions & 5 deletions)
@@ -781,18 +781,18 @@ private static BertTokenizer Create(
}

// Add the special token into our dictionary, normalizing it, and adding it into the
// main vocab, if needed.
AddSpecialToken(vocab, tempSpecialTokens, kvp.Key, true);
}
}
}
else
{
// Create a dictionary with the special tokens - store the un-normalized forms in the options as
// that field is exposed to the public. In addition, store the normalized form for creating the
// pre-tokenizer.
-Dictionary<string, int> tempSpecialTokens = {};
-Dictionary<string, int> notNormalizedSpecialTokens = {};
+Dictionary<string, int> tempSpecialTokens = new Dictionary<string, int>();
+Dictionary<string, int> notNormalizedSpecialTokens = new Dictionary<string, int>();
AddSpecialToken(vocab, tempSpecialTokens, options.UnknownToken, lowerCase, notNormalizedSpecialTokens);
AddSpecialToken(vocab, tempSpecialTokens, options.SeparatorToken, lowerCase, notNormalizedSpecialTokens);
AddSpecialToken(vocab, tempSpecialTokens, options.PaddingToken, lowerCase, notNormalizedSpecialTokens);
@@ -804,7 +804,7 @@ private static BertTokenizer Create(
}
}

// We set the PreTokenizer here using the normalized special tokens dict (if relevant), and therefore we can
// keep the not-normalized special tokens dict in the options passed to the WordPieceTokenizer.
options.PreTokenizer ??= options.ApplyBasicTokenization ? PreTokenizer.CreateWordOrPunctuation(options.SplitOnSpecialTokens ? specialTokensDict : null) : PreTokenizer.CreateWhiteSpace();

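
For context on how these options are consumed, a hedged usage sketch: the hunks above reference options.UnknownToken, options.SeparatorToken, options.PaddingToken, options.ApplyBasicTokenization, and options.SplitOnSpecialTokens, and the snippet below wires them up the way a caller might. The BertOptions property setters, the "vocab.txt" path, and the exact Create overload are assumptions, not taken from this PR.

using Microsoft.ML.Tokenizers;

// Assumed public surface; see the hedge above.
var options = new BertOptions
{
    UnknownToken = "[UNK]",
    SeparatorToken = "[SEP]",
    PaddingToken = "[PAD]",
    ApplyBasicTokenization = true, // selects the word-or-punctuation pre-tokenizer
    SplitOnSpecialTokens = true    // that pre-tokenizer also splits on special tokens
};

BertTokenizer tokenizer = BertTokenizer.Create("vocab.txt", options);
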
test/Microsoft.ML.Tokenizers.Tests/BertTokenizerTests.cs (4 changes: 2 additions & 2 deletions)
@@ -18,8 +18,8 @@ public class BertTokenizerTests
public void TestWithLowerCasingExplicitSpecialTokens()
{
// Add [SPECIAL] token at end (to keep indices as is)
// Ids: 0 1 2 3 4 5 6 7 8 9 10 11 12, 13
-string[] vocabTokens = {"[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "!", ",", "?", "hello", "world", "how", "are", "you", "[SPECIAL]"};
+string[] vocabTokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "!", ",", "?", "hello", "world", "how", "are", "you", "[SPECIAL]"];

string vocabFile = WordPieceTests.CreateVocabFile(vocabTokens);
