From 2a0ab38e2812d712267ca6bb663b0ab4b5ab6e09 Mon Sep 17 00:00:00 2001
From: Parmeet  Singh Bhatia <parmeetbhatia@fb.com>
Date: Mon, 19 Apr 2021 22:58:34 -0400
Subject: [PATCH 1/4] added tests for factory functions for experimental vocab
 and modified their API

---
 test/experimental/test_vocab.py | 32 +++++++++++++++++++++++++++++++-
 torchtext/experimental/vocab.py | 24 ++++++++++--------------
 2 files changed, 41 insertions(+), 15 deletions(-)
diff --git a/test/experimental/test_vocab.py b/test/experimental/test_vocab.py
index 879c03e72d..5976d782ea 100644
--- a/test/experimental/test_vocab.py
+++ b/test/experimental/test_vocab.py
@@ -5,9 +5,12 @@
 import torch
 import unittest
 from test.common.torchtext_test_case import TorchtextTestCase
+from torchtext.experimental.transforms import basic_english_normalize
 from torchtext.experimental.vocab import (
     vocab,
     build_vocab_from_iterator,
+    build_vocab_from_text_file,
+    load_vocab_from_file,
 )
 
 
@@ -217,9 +220,36 @@ def test_vocab_load_and_save(self):
             self.assertEqual(v.get_itos(), expected_itos)
             self.assertEqual(dict(loaded_v.get_stoi()), expected_stoi)
 
+    def test_build_vocab_from_vocab_file(self):
+        iterator = ['ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low']
+        with self.subTest('buildfromvocabfile'):
+            vocab_path = os.path.join(self.test_dir, 'vocab.txt')
+            with open(vocab_path, 'w') as f:
+                f.write('\n'.join(iterator))
+            v = load_vocab_from_file(vocab_path)
+            expected_itos = ['<unk>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low']
+            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
+            self.assertEqual(v.get_itos(), expected_itos)
+            self.assertEqual(dict(v.get_stoi()), expected_stoi)
+
+    def test_build_vocab_from_text_file(self):
+        iterator = ['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T',
+                    'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']
+        with self.subTest('buildfromtextfile'):
+            vocab_path = os.path.join(self.test_dir, 'vocab.txt')
+            with open(vocab_path, 'w') as f:
+                f.write(' '.join(iterator))
+                f.write('\n')
+            tokenizer = torch.jit.script(basic_english_normalize())
+            v = build_vocab_from_text_file(vocab_path, tokenizer)
+            expected_itos = ['<unk>', 'ᑌᑎiᑕoᗪᕮ_tᕮ᙭t', 'hello', 'world', 'freq_low']
+            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
+            self.assertEqual(v.get_itos(), expected_itos)
+            self.assertEqual(dict(v.get_stoi()), expected_stoi)
+
     def test_build_vocab_iterator(self):
         iterator = [['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T',
-                    'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']]
+                     'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']]
         v = build_vocab_from_iterator(iterator)
         expected_itos = ['<unk>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low']
         expected_stoi = {x: index for index, x in enumerate(expected_itos)}
diff --git a/torchtext/experimental/vocab.py b/torchtext/experimental/vocab.py
index 2f13b083d6..7045ba65b0 100644
--- a/torchtext/experimental/vocab.py
+++ b/torchtext/experimental/vocab.py
@@ -19,12 +19,11 @@
 logger = logging.getLogger(__name__)
 
 
-def build_vocab_from_text_file(file_object, jited_tokenizer, min_freq=1, unk_token='<unk>', num_cpus=4):
+def build_vocab_from_text_file(file_path, jited_tokenizer, min_freq=1, unk_token='<unk>', num_cpus=4):
     r"""Create a `Vocab` object from a raw text file.
 
-    The `file_object` can contain any raw text. This function applies a generic JITed tokenizer in
-    parallel to the text. Note that the vocab will be created in the order that the tokens first appear
-    in the file (and not by the frequency of tokens).
+    The file at `file_path` can contain any raw text. This function applies a generic JITed tokenizer in
+    parallel to the text.
 
     Args:
         file_object (FileObject): a file object to read data from.
@@ -40,20 +39,18 @@ def build_vocab_from_text_file(file_object, jited_tokenizer, min_freq=1, unk_tok
     Examples:
         >>> from torchtext.experimental.vocab import build_vocab_from_text_file
         >>> from torchtext.experimental.transforms import basic_english_normalize
-        >>> f = open('vocab.txt', 'r')
-        >>>     tokenizer = basic_english_normalize()
+        >>> import torch
         >>> tokenizer = basic_english_normalize()
         >>> jit_tokenizer = torch.jit.script(tokenizer)
-        >>> v = build_vocab_from_text_file(f, jit_tokenizer)
+        >>> v = build_vocab_from_text_file('vocab.txt', jit_tokenizer)
     """
-    vocab_obj = _build_vocab_from_text_file(file_object.name, unk_token, min_freq, num_cpus, jited_tokenizer)
+    vocab_obj = _build_vocab_from_text_file(file_path, unk_token, min_freq, num_cpus, jited_tokenizer)
     return Vocab(vocab_obj)
 
 
-def load_vocab_from_file(file_object, min_freq=1, unk_token='<unk>', num_cpus=4):
+def load_vocab_from_file(file_path, min_freq=1, unk_token='<unk>', num_cpus=4):
     r"""Create a `Vocab` object from a text file.
-    The `file_object` should contain tokens separated by new lines. Note that the vocab
-    will be created in the order that the tokens first appear in the file (and not by the frequency of tokens).
+    The file at `file_path` should contain tokens separated by new lines.
     Format for txt file:
 
         token1
@@ -73,11 +70,10 @@ def load_vocab_from_file(file_object, min_freq=1, unk_token='<unk>', num_cpus=4)
 
     Examples:
         >>> from torchtext.experimental.vocab import load_vocab_from_file
-        >>> f = open('vocab.txt', 'r')
-        >>> v = load_vocab_from_file(f)
+        >>> v = load_vocab_from_file('vocab.txt')
     """
 
-    vocab_obj = _load_vocab_from_file(file_object.name, unk_token, min_freq, num_cpus)
+    vocab_obj = _load_vocab_from_file(file_path, unk_token, min_freq, num_cpus)
     return Vocab(vocab_obj)
 
 

From d54d92fd385a4e790438e308ae8d97ce59ff5e89 Mon Sep 17 00:00:00 2001
From: Parmeet  Singh Bhatia <parmeetbhatia@fb.com>
Date: Tue, 20 Apr 2021 09:49:42 -0400
Subject: [PATCH 2/4] modified tests to pass file paths to vocab factories

---
 test/experimental/test_vocab.py      | 34 ++----------------
 test/experimental/test_with_asset.py | 54 +++++++++++++---------------
 2 files changed, 27 insertions(+), 61 deletions(-)

diff --git a/test/experimental/test_vocab.py b/test/experimental/test_vocab.py
index 5976d782ea..73f74bc16e 100644
--- a/test/experimental/test_vocab.py
+++ b/test/experimental/test_vocab.py
@@ -5,12 +5,9 @@
 import torch
 import unittest
 from test.common.torchtext_test_case import TorchtextTestCase
-from torchtext.experimental.transforms import basic_english_normalize
 from torchtext.experimental.vocab import (
     vocab,
     build_vocab_from_iterator,
-    build_vocab_from_text_file,
-    load_vocab_from_file,
 )
 
 
@@ -220,38 +217,11 @@ def test_vocab_load_and_save(self):
             self.assertEqual(v.get_itos(), expected_itos)
             self.assertEqual(dict(loaded_v.get_stoi()), expected_stoi)
 
-    def test_build_vocab_from_vocab_file(self):
-        iterator = ['ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low']
-        with self.subTest('buildfromvocabfile'):
-            vocab_path = os.path.join(self.test_dir, 'vocab.txt')
-            with open(vocab_path, 'w') as f:
-                f.write('\n'.join(iterator))
-            v = load_vocab_from_file(vocab_path)
-            expected_itos = ['<unk>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low']
-            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
-            self.assertEqual(v.get_itos(), expected_itos)
-            self.assertEqual(dict(v.get_stoi()), expected_stoi)
-
-    def test_build_vocab_from_text_file(self):
-        iterator = ['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T',
-                    'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']
-        with self.subTest('buildfromtextfile'):
-            vocab_path = os.path.join(self.test_dir, 'vocab.txt')
-            with open(vocab_path, 'w') as f:
-                f.write(' '.join(iterator))
-                f.write('\n')
-            tokenizer = torch.jit.script(basic_english_normalize())
-            v = build_vocab_from_text_file(vocab_path, tokenizer)
-            expected_itos = ['<unk>', 'ᑌᑎiᑕoᗪᕮ_tᕮ᙭t', 'hello', 'world', 'freq_low']
-            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
-            self.assertEqual(v.get_itos(), expected_itos)
-            self.assertEqual(dict(v.get_stoi()), expected_stoi)
-
     def test_build_vocab_iterator(self):
         iterator = [['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T',
-                     'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']]
+                    'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']]
         v = build_vocab_from_iterator(iterator)
         expected_itos = ['<unk>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low']
         expected_stoi = {x: index for index, x in enumerate(expected_itos)}
         self.assertEqual(v.get_itos(), expected_itos)
-        self.assertEqual(dict(v.get_stoi()), expected_stoi)
+        self.assertEqual(dict(v.get_stoi()), expected_stoi)
\ No newline at end of file
diff --git a/test/experimental/test_with_asset.py b/test/experimental/test_with_asset.py
index a9be86d81b..2055e0ab57 100644
--- a/test/experimental/test_with_asset.py
+++ b/test/experimental/test_with_asset.py
@@ -78,13 +78,12 @@ class TestTransformsWithAsset(TorchtextTestCase):
     def test_vocab_transform(self):
         asset_name = 'vocab_test2.txt'
         asset_path = get_asset_path(asset_name)
-        with open(asset_path, 'r') as f:
-            vocab_transform = VocabTransform(load_vocab_from_file(f))
-            self.assertEqual(vocab_transform(['of', 'that', 'new']),
-                             [7, 18, 24])
-            jit_vocab_transform = torch.jit.script(vocab_transform)
-            self.assertEqual(jit_vocab_transform(['of', 'that', 'new', 'that']),
-                             [7, 18, 24, 18])
+        vocab_transform = VocabTransform(load_vocab_from_file(asset_path))
+        self.assertEqual(vocab_transform(['of', 'that', 'new']),
+                         [7, 18, 24])
+        jit_vocab_transform = torch.jit.script(vocab_transform)
+        self.assertEqual(jit_vocab_transform(['of', 'that', 'new', 'that']),
+                         [7, 18, 24, 18])
 
     def test_errors_vectors_python(self):
         tokens = []
@@ -179,27 +178,25 @@ def test_glove_different_dims(self):
     def test_vocab_from_file(self):
         asset_name = 'vocab_test.txt'
         asset_path = get_asset_path(asset_name)
-        with open(asset_path, 'r') as f:
-            v = load_vocab_from_file(f, unk_token='<new_unk>')
-            expected_itos = ['<new_unk>', 'b', 'a', 'c']
-            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
-            self.assertEqual(v.get_itos(), expected_itos)
-            self.assertEqual(dict(v.get_stoi()), expected_stoi)
+        v = load_vocab_from_file(asset_path, unk_token='<new_unk>')
+        expected_itos = ['<new_unk>', 'b', 'a', 'c']
+        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
+        self.assertEqual(v.get_itos(), expected_itos)
+        self.assertEqual(dict(v.get_stoi()), expected_stoi)
 
     def test_vocab_from_raw_text_file(self):
         asset_name = 'vocab_raw_text_test.txt'
         asset_path = get_asset_path(asset_name)
-        with open(asset_path, 'r') as f:
-            tokenizer = basic_english_normalize()
-            jit_tokenizer = torch.jit.script(tokenizer)
-            v = build_vocab_from_text_file(f, jit_tokenizer, unk_token='<new_unk>')
-            expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
-                             'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
-                             'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
-                             'unions', 'with', 'workers']
-            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
-            self.assertEqual(v.get_itos(), expected_itos)
-            self.assertEqual(dict(v.get_stoi()), expected_stoi)
+        tokenizer = basic_english_normalize()
+        jit_tokenizer = torch.jit.script(tokenizer)
+        v = build_vocab_from_text_file(asset_path, jit_tokenizer, unk_token='<new_unk>')
+        expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
+                         'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
+                         'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
+                         'unions', 'with', 'workers']
+        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
+        self.assertEqual(v.get_itos(), expected_itos)
+        self.assertEqual(dict(v.get_stoi()), expected_stoi)
 
     def test_builtin_pretrained_sentencepiece_processor(self):
         sp_model_path = download_from_url(PRETRAINED_SP_MODEL['text_unigram_25000'])
@@ -241,11 +238,10 @@ def batch_func(data):
     def test_text_sequential_transform(self):
         asset_name = 'vocab_test2.txt'
         asset_path = get_asset_path(asset_name)
-        with open(asset_path, 'r') as f:
-            pipeline = TextSequentialTransforms(basic_english_normalize(), load_vocab_from_file(f))
-            jit_pipeline = torch.jit.script(pipeline)
-            self.assertEqual(pipeline('of that new'), [7, 18, 24])
-            self.assertEqual(jit_pipeline('of that new'), [7, 18, 24])
+        pipeline = TextSequentialTransforms(basic_english_normalize(), load_vocab_from_file(asset_path))
+        jit_pipeline = torch.jit.script(pipeline)
+        self.assertEqual(pipeline('of that new'), [7, 18, 24])
+        self.assertEqual(jit_pipeline('of that new'), [7, 18, 24])
 
     def test_vectors_from_file(self):
         asset_name = 'vectors_test.csv'

From 0ba55ef0c28ed4c7a880a91f74c46d2b1f9aab34 Mon Sep 17 00:00:00 2001
From: Parmeet  Singh Bhatia <parmeetbhatia@fb.com>
Date: Tue, 20 Apr 2021 09:50:59 -0400
Subject: [PATCH 3/4] minor linter issue

---
 test/experimental/test_vocab.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/experimental/test_vocab.py b/test/experimental/test_vocab.py
index 73f74bc16e..85c58ea67d 100644
--- a/test/experimental/test_vocab.py
+++ b/test/experimental/test_vocab.py
@@ -219,9 +219,9 @@ def test_vocab_load_and_save(self):
 
     def test_build_vocab_iterator(self):
         iterator = [['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T',
-                    'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']]
+                     'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']]
         v = build_vocab_from_iterator(iterator)
         expected_itos = ['<unk>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low']
         expected_stoi = {x: index for index, x in enumerate(expected_itos)}
         self.assertEqual(v.get_itos(), expected_itos)
-        self.assertEqual(dict(v.get_stoi()), expected_stoi)
\ No newline at end of file
+        self.assertEqual(dict(v.get_stoi()), expected_stoi)

From de8986446192c4931ae9af9e800d89be8e63d124 Mon Sep 17 00:00:00 2001
From: Parmeet Bhatia <bhatia.parmeet@gmail.com>
Date: Wed, 21 Apr 2021 11:33:41 -0700
Subject: [PATCH 4/4] added line-by-line lookup benchmark

---
 benchmark/benchmark_experimental_vocab.py | 43 +++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/benchmark/benchmark_experimental_vocab.py b/benchmark/benchmark_experimental_vocab.py
index 3cd4e49a1d..a19e6267d8 100644
--- a/benchmark/benchmark_experimental_vocab.py
+++ b/benchmark/benchmark_experimental_vocab.py
@@ -1,7 +1,10 @@
 import argparse
 from collections import (Counter, OrderedDict)
 import time
-
+from timeit import default_timer as timer
+import random
+import string
+from matplotlib import pyplot as plt
 import torch
 from torchtext.experimental.datasets import DATASETS
 from torchtext.experimental.vocab import (
@@ -16,6 +19,42 @@
 from torchtext.experimental.transforms import basic_english_normalize
 
 
+def compare_legacy_and_experimental_batch_lookup():
+    num_tokens = 1000
+    num_letters = 6
+    num_lines = 100000
+    vocab = [''.join(random.sample(string.ascii_letters * num_letters, num_letters)) for _ in range(num_tokens)]
+    counter = Counter()
+    counter.update(vocab)
+    legacy_vocab = Vocab(counter)
+    experimental_vocab = VocabExperimental(counter)
+    speed_ups = []
+    token_lengths = [i for i in range(2, 100)]
+    for i in token_lengths:
+        lines = [random.sample(vocab, i) for _ in range(num_lines)]
+        start_time = timer()
+        for l in lines:
+            legacy_vocab.lookup_indices(l)
+        legacy_time = timer() - start_time
+
+        start_time = timer()
+        for l in lines:
+            experimental_vocab.lookup_indices(l)
+
+        experimental_time = timer() - start_time
+
+        speed_ups.append(legacy_time / experimental_time)
+        print("speed-up={} for average length={}".format(legacy_time / experimental_time, i))
+        del lines
+
+    plt.close()
+    fig, ax = plt.subplots(1,1)
+    ax.plot(token_lengths, speed_ups)
+    ax.set_xlabel('Average Tokens per line')
+    ax.set_ylabel('Speed-up')
+    plt.savefig("speedup.jpg")
+
+
 def legacy_vocab_from_file_object(file_like_object, **kwargs):
     r"""Create a `Vocab` object from a file like object.
 
@@ -76,7 +115,7 @@ def benchmark_experimental_vocab_construction(vocab_file_path, is_raw_text=True,
         print("Construction time:", time.monotonic() - t0)
 
 
-def benchmark_experimental_vocab_lookup(vocab_file_path=None, dataset = 'AG_NEWS'):
+def benchmark_experimental_vocab_lookup(vocab_file_path=None, dataset='AG_NEWS'):
     def _run_benchmark_lookup(tokens, vocab):
         t0 = time.monotonic()
         # list lookup