unit testing for experimental vocab #1301

Closed · wants to merge 1 commit
9 changes: 9 additions & 0 deletions test/experimental/test_vocab.py
@@ -76,6 +76,10 @@ def test_vocab_insert_token(self):

self.assertEqual(v.get_itos(), expected_itos)
self.assertEqual(dict(v.get_stoi()), expected_stoi)
with self.assertRaises(RuntimeError) as context:
    v.insert_token('b', 0)

self.assertTrue("Token b already exists in the Vocab with index: 0" in str(context.exception))

def test_vocab_append_token(self):
c = OrderedDict({'a': 2})
@@ -88,6 +92,11 @@ def test_vocab_append_token(self):
self.assertEqual(v.get_itos(), expected_itos)
self.assertEqual(dict(v.get_stoi()), expected_stoi)

with self.assertRaises(RuntimeError) as context:
    v.append_token('b')

self.assertTrue("Token b already exists in the Vocab with index: 2" in str(context.exception))

def test_vocab_len(self):
token_to_freq = {'a': 2, 'b': 2, 'c': 2}
sorted_by_freq_tuples = sorted(token_to_freq.items(), key=lambda x: x[1], reverse=True)
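For reference, a minimal usage sketch of the behavior these tests pin down. The vocab factory call and the exact indices are assumptions; the error messages come from the assertions above.

    # Minimal sketch; assumes the vocab factory from torchtext.experimental.vocab.
    from collections import OrderedDict
    from torchtext.experimental.vocab import vocab

    v = vocab(OrderedDict([('a', 2), ('b', 2)]))
    v.append_token('c')      # fine: 'c' is not in the vocab yet
    v.insert_token('b', 0)   # raises RuntimeError: Token b already exists in the Vocab with index: ...
    v.append_token('b')      # would raise the same RuntimeError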
18 changes: 16 additions & 2 deletions torchtext/csrc/vocab.cpp
@@ -38,7 +38,6 @@ bool Vocab::__contains__(const c10::string_view &token) const {
return false;
}


int64_t Vocab::__getitem__(const c10::string_view &token) const {
int64_t id = _find(token);
if (stoi_[id] != -1) {
@@ -47,7 +46,22 @@ int64_t Vocab::__getitem__(const c10::string_view &token) const {
return unk_index_;
}

void Vocab::append_token(const std::string &token) { _add(token); }
void Vocab::append_token(const std::string &token) {
  // if the token is already in stoi, throw an error
  auto token_position = _find(c10::string_view{token.data(), token.size()});
  if (stoi_[token_position] != -1) {
#ifdef _MSC_VER
    std::cerr << "[RuntimeError] Token " << token
              << " already exists in the Vocab with index: "
              << stoi_[token_position] << std::endl;
#endif
    throw std::runtime_error("Token " + token +
                             " already exists in the Vocab with index: " +
                             std::to_string(stoi_[token_position]) + ".");
  }

  _add(token);
}

void Vocab::insert_token(const std::string &token, const int64_t &index) {
if (index < 0 || index > itos_.size()) {
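The new guard reuses the open-addressing lookup: _find returns a slot, and an occupied slot (stoi_[token_position] != -1) means the token is already present, so append_token now throws instead of silently re-adding. A rough Python equivalent of the guard, where the class and its stoi/_find/_add members are stand-ins mirroring the C++, not the shipped API:

    # Hypothetical Python mirror of the C++ duplicate check in append_token.
    def append_token(self, token):
        slot = self._find(token)
        if self.stoi[slot] != -1:  # occupied slot => token already present
            raise RuntimeError("Token " + token +
                               " already exists in the Vocab with index: " +
                               str(self.stoi[slot]) + ".")
        self._add(token)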
2 changes: 1 addition & 1 deletion torchtext/csrc/vocab.h
@@ -44,7 +44,7 @@ struct Vocab : torch::CustomClassHolder {
uint32_t _find(const c10::string_view &w) const {
  uint32_t stoi_size = stoi_.size();
  uint32_t id = _hash(w) % stoi_size;
  while (stoi_[id] != -1 && itos_[stoi_[id]]!= w) {
  while (stoi_[id] != -1 && itos_[stoi_[id]] != w) {
    id = (id + 1) % stoi_size;
  }
  return id;
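_find is a linear probe over an open-addressing table: hash the token, then walk forward until hitting an empty slot (-1, token absent) or a slot whose stored index maps back to the same token. A self-contained Python sketch of the same probe; the table layout (stoi holding indices into itos, -1 for empty) follows the C++ above, while the hash function is a stand-in:

    # Sketch of the linear probe in _find; hash() stands in for _hash.
    def find(stoi, itos, token):
        size = len(stoi)
        slot = hash(token) % size
        # probe forward until an empty slot or the matching token
        while stoi[slot] != -1 and itos[stoi[slot]] != token:
            slot = (slot + 1) % size
        return slot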
3 changes: 3 additions & 0 deletions torchtext/experimental/vocab.py
@@ -214,6 +214,9 @@ def append_token(self, token: str) -> None:
r"""
Args:
token (str): the token to be appended to the vocab.

Raises:
RuntimeError: if the token already exists in the vocab.
"""
self.vocab.append_token(token)
