fixes from PR comments

dotnet · michaelgsharp · Jan 6, 2024 · Dec 12, 2023 · Dec 24, 2023 · Jan 3, 2024
commit 9fd094c5cbb1eda27b5e48c0f46b74613e97444c
diff --git a/src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs b/src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs
@@ -27,7 +27,6 @@ public sealed class EnglishRoberta : Model
         private readonly IReadOnlyDictionary<char, char> _unicodeToByte;
         private readonly string[] _charToString;
         private readonly Cache<string, IReadOnlyList<Token>> _cache;
-        private const char StartChar = (char)(' ' + 256);
 
         /// <summary>
         /// Construct tokenizer object to use with the English Robert model.
@@ -593,13 +592,6 @@ public override bool IsValidChar(char ch)
         {
             return _byteToUnicode.ContainsKey(ch);
         }
-
-        public override bool IsFirstTokenInWord(string token)
-        {
-            if (token == null)
-                return true;
-            return token.Length != 0 && token[0] == _startChar;
-        }
     }
 
     /// <summary>

diff --git a/src/Microsoft.ML.Tokenizers/Model/Model.cs b/src/Microsoft.ML.Tokenizers/Model/Model.cs
@@ -66,13 +66,5 @@ public abstract class Model
         /// <param name="ch"></param>
         /// <returns></returns>
         public abstract bool IsValidChar(char ch);
-
-        /// <summary>
-        /// Returns if the first character of the token is part of the actual word or not.
-        /// </summary>
-        /// <param name="token"></param>
-        /// <returns></returns>
-        public abstract bool IsFirstTokenInWord(string token);
     }
-
 }
diff --git a/src/Microsoft.ML.TorchSharp/NasBert/NasBertTrainer.cs b/src/Microsoft.ML.TorchSharp/NasBert/NasBertTrainer.cs
@@ -584,7 +584,9 @@ private IList<int> PrepInputTokens(ref ReadOnlyMemory<char> sentence1, ref ReadO
                 getSentence1(ref sentence1);
                 if (getSentence2 == default)
                 {
-                    return InitTokenArray.Concat(tokenizer.EncodeToConverted(sentence1.ToString())).ToList();
+                    List<int> newList = new List<int>(tokenizer.EncodeToConverted(sentence1.ToString()));
+                    newList.Insert(0, 0);
+                    return newList;
                 }
                 else
                 {

diff --git a/src/Microsoft.ML.TorchSharp/NasBert/NerTrainer.cs b/src/Microsoft.ML.TorchSharp/NasBert/NerTrainer.cs
@@ -62,6 +62,8 @@ namespace Microsoft.ML.TorchSharp.NasBert
     ///
     public class NerTrainer : NasBertTrainer<VBuffer<uint>, TargetType>
     {
+        private const char StartChar = (char)(' ' + 256); // ' ' (32) + 256 = U+0120 ('Ġ'); presumably the tokenizer's marker for a leading space — TODO confirm against EnglishRoberta
+
         public class NerOptions : NasBertOptions
         {
             public NerOptions()
@@ -109,6 +111,8 @@ private protected override TorchSharpBaseTransformer<VBuffer<uint>, TargetType>
             return new NerTransformer(host, options as NasBertOptions, model as NasBertModel, labelColumn);
         }
 
+        internal static bool TokenStartsWithSpace(string token) => token is null || (token.Length != 0 && token[0] == StartChar); // null => true; "" => false
+
         private protected class Trainer : NasBertTrainerBase
         {
             private const string ModelUrlString = "models/pretrained_NasBert_14M_encoder.tsm";
@@ -172,7 +176,7 @@ private protected override torch.Tensor PrepareRowTensor(ref VBuffer<uint> targe
                     var newValues = targetEditor.Values;
                     for (var i = 0; i < encoding.Tokens.Count; i++)
                     {
-                        if (Tokenizer.Model.IsFirstTokenInWord(encoding.Tokens[i]))
+                        if (NerTrainer.TokenStartsWithSpace(encoding.Tokens[i]))
                         {
                             newValues[i] = target.GetItemOrDefault(++targetIndex);
                         }
@@ -382,7 +386,7 @@ private void CondenseOutput(ref VBuffer<UInt32> dst, string sentence, Tokenizer
                 // Figure out actual count of output tokens
                 for (var i = 0; i < encoding.Tokens.Count; i++)
                 {
-                    if (tokenizer.Model.IsFirstTokenInWord(encoding.Tokens[i]))
+                    if (NerTrainer.TokenStartsWithSpace(encoding.Tokens[i]))
                     {
                         targetIndex++;
                     }
@@ -396,7 +400,7 @@ private void CondenseOutput(ref VBuffer<UInt32> dst, string sentence, Tokenizer
 
                 for (var i = 1; i < encoding.Tokens.Count; i++)
                 {
-                    if (tokenizer.Model.IsFirstTokenInWord(encoding.Tokens[i]))
+                    if (NerTrainer.TokenStartsWithSpace(encoding.Tokens[i]))
                     {
                         newValues[targetIndex++] = (uint)prediction[i];
                     }