From 2a0ab38e2812d712267ca6bb663b0ab4b5ab6e09 Mon Sep 17 00:00:00 2001 From: Parmeet Singh Bhatia Date: Mon, 19 Apr 2021 22:58:34 -0400 Subject: [PATCH 1/4] added tests for factory functions for experimental vocab and modified their API --- test/experimental/test_vocab.py | 32 +++++++++++++++++++++++++++++++- torchtext/experimental/vocab.py | 24 ++++++++++-------------- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/test/experimental/test_vocab.py b/test/experimental/test_vocab.py index 879c03e72d..5976d782ea 100644 --- a/test/experimental/test_vocab.py +++ b/test/experimental/test_vocab.py @@ -5,9 +5,12 @@ import torch import unittest from test.common.torchtext_test_case import TorchtextTestCase +from torchtext.experimental.transforms import basic_english_normalize from torchtext.experimental.vocab import ( vocab, build_vocab_from_iterator, + build_vocab_from_text_file, + load_vocab_from_file, ) @@ -217,9 +220,36 @@ def test_vocab_load_and_save(self): self.assertEqual(v.get_itos(), expected_itos) self.assertEqual(dict(loaded_v.get_stoi()), expected_stoi) + def test_build_vocab_from_vocab_file(self): + iterator = ['ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low'] + with self.subTest('buildfromvocabfile'): + vocab_path = os.path.join(self.test_dir, 'vocab.txt') + with open(vocab_path, 'w') as f: + f.write('\n'.join(iterator)) + v = load_vocab_from_file(vocab_path) + expected_itos = ['', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low'] + expected_stoi = {x: index for index, x in enumerate(expected_itos)} + self.assertEqual(v.get_itos(), expected_itos) + self.assertEqual(dict(v.get_stoi()), expected_stoi) + + def test_build_vocab_from_text_file(self): + iterator = ['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', + 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T'] + with self.subTest('buildfromtextfile'): + vocab_path = os.path.join(self.test_dir, 'vocab.txt') + with open(vocab_path, 'w') as f: + f.write(' '.join(iterator)) + f.write('\n') + tokenizer = torch.jit.script(basic_english_normalize()) + v = build_vocab_from_text_file(vocab_path, tokenizer) + expected_itos = ['', 'ᑌᑎiᑕoᗪᕮ_tᕮ᙭t', 'hello', 'world', 'freq_low'] + expected_stoi = {x: index for index, x in enumerate(expected_itos)} + self.assertEqual(v.get_itos(), expected_itos) + self.assertEqual(dict(v.get_stoi()), expected_stoi) + def test_build_vocab_iterator(self): iterator = [['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', - 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']] + 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']] v = build_vocab_from_iterator(iterator) expected_itos = ['', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low'] expected_stoi = {x: index for index, x in enumerate(expected_itos)} diff --git a/torchtext/experimental/vocab.py b/torchtext/experimental/vocab.py index 2f13b083d6..7045ba65b0 100644 --- a/torchtext/experimental/vocab.py +++ b/torchtext/experimental/vocab.py @@ -19,12 +19,11 @@ logger = logging.getLogger(__name__) -def build_vocab_from_text_file(file_object, jited_tokenizer, min_freq=1, unk_token='', num_cpus=4): +def build_vocab_from_text_file(file_path, jited_tokenizer, min_freq=1, unk_token='', num_cpus=4): r"""Create a `Vocab` object from a raw text file. - The `file_object` can contain any raw text. This function applies a generic JITed tokenizer in - parallel to the text. Note that the vocab will be created in the order that the tokens first appear - in the file (and not by the frequency of tokens). + The `file_path` can contain any raw text. This function applies a generic JITed tokenizer in + parallel to the text. Args: file_object (FileObject): a file object to read data from. @@ -40,20 +39,18 @@ def build_vocab_from_text_file(file_object, jited_tokenizer, min_freq=1, unk_tok Examples: >>> from torchtext.experimental.vocab import build_vocab_from_text_file >>> from torchtext.experimental.transforms import basic_english_normalize - >>> f = open('vocab.txt', 'r') - >>> tokenizer = basic_english_normalize() + >>> tokenizer = basic_english_normalize() >>> tokenizer = basic_english_normalize() >>> jit_tokenizer = torch.jit.script(tokenizer) - >>> v = build_vocab_from_text_file(f, jit_tokenizer) + >>> v = build_vocab_from_text_file('vocab.txt', jit_tokenizer) """ - vocab_obj = _build_vocab_from_text_file(file_object.name, unk_token, min_freq, num_cpus, jited_tokenizer) + vocab_obj = _build_vocab_from_text_file(file_path, unk_token, min_freq, num_cpus, jited_tokenizer) return Vocab(vocab_obj) -def load_vocab_from_file(file_object, min_freq=1, unk_token='', num_cpus=4): +def load_vocab_from_file(file_path, min_freq=1, unk_token='', num_cpus=4): r"""Create a `Vocab` object from a text file. - The `file_object` should contain tokens separated by new lines. Note that the vocab - will be created in the order that the tokens first appear in the file (and not by the frequency of tokens). + The `file_path` should contain tokens separated by new lines. Format for txt file: token1 @@ -73,11 +70,10 @@ def load_vocab_from_file(file_object, min_freq=1, unk_token='', num_cpus=4) Examples: >>> from torchtext.experimental.vocab import load_vocab_from_file - >>> f = open('vocab.txt', 'r') - >>> v = load_vocab_from_file(f) + >>> v = load_vocab_from_file('vocab.txt') """ - vocab_obj = _load_vocab_from_file(file_object.name, unk_token, min_freq, num_cpus) + vocab_obj = _load_vocab_from_file(file_path, unk_token, min_freq, num_cpus) return Vocab(vocab_obj) From d54d92fd385a4e790438e308ae8d97ce59ff5e89 Mon Sep 17 00:00:00 2001 From: Parmeet Singh Bhatia Date: Tue, 20 Apr 2021 09:49:42 -0400 Subject: [PATCH 2/4] modified tests --- test/experimental/test_vocab.py | 34 ++---------------- test/experimental/test_with_asset.py | 54 +++++++++++++--------------- 2 files changed, 27 insertions(+), 61 deletions(-) diff --git a/test/experimental/test_vocab.py b/test/experimental/test_vocab.py index 5976d782ea..73f74bc16e 100644 --- a/test/experimental/test_vocab.py +++ b/test/experimental/test_vocab.py @@ -5,12 +5,9 @@ import torch import unittest from test.common.torchtext_test_case import TorchtextTestCase -from torchtext.experimental.transforms import basic_english_normalize from torchtext.experimental.vocab import ( vocab, build_vocab_from_iterator, - build_vocab_from_text_file, - load_vocab_from_file, ) @@ -220,38 +217,11 @@ def test_vocab_load_and_save(self): self.assertEqual(v.get_itos(), expected_itos) self.assertEqual(dict(loaded_v.get_stoi()), expected_stoi) - def test_build_vocab_from_vocab_file(self): - iterator = ['ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low'] - with self.subTest('buildfromvocabfile'): - vocab_path = os.path.join(self.test_dir, 'vocab.txt') - with open(vocab_path, 'w') as f: - f.write('\n'.join(iterator)) - v = load_vocab_from_file(vocab_path) - expected_itos = ['', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low'] - expected_stoi = {x: index for index, x in enumerate(expected_itos)} - self.assertEqual(v.get_itos(), expected_itos) - self.assertEqual(dict(v.get_stoi()), expected_stoi) - - def test_build_vocab_from_text_file(self): - iterator = ['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', - 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T'] - with self.subTest('buildfromtextfile'): - vocab_path = os.path.join(self.test_dir, 'vocab.txt') - with open(vocab_path, 'w') as f: - f.write(' '.join(iterator)) - f.write('\n') - tokenizer = torch.jit.script(basic_english_normalize()) - v = build_vocab_from_text_file(vocab_path, tokenizer) - expected_itos = ['', 'ᑌᑎiᑕoᗪᕮ_tᕮ᙭t', 'hello', 'world', 'freq_low'] - expected_stoi = {x: index for index, x in enumerate(expected_itos)} - self.assertEqual(v.get_itos(), expected_itos) - self.assertEqual(dict(v.get_stoi()), expected_stoi) - def test_build_vocab_iterator(self): iterator = [['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', - 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']] + 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']] v = build_vocab_from_iterator(iterator) expected_itos = ['', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low'] expected_stoi = {x: index for index, x in enumerate(expected_itos)} self.assertEqual(v.get_itos(), expected_itos) - self.assertEqual(dict(v.get_stoi()), expected_stoi) + self.assertEqual(dict(v.get_stoi()), expected_stoi) \ No newline at end of file diff --git a/test/experimental/test_with_asset.py b/test/experimental/test_with_asset.py index a9be86d81b..2055e0ab57 100644 --- a/test/experimental/test_with_asset.py +++ b/test/experimental/test_with_asset.py @@ -78,13 +78,12 @@ class TestTransformsWithAsset(TorchtextTestCase): def test_vocab_transform(self): asset_name = 'vocab_test2.txt' asset_path = get_asset_path(asset_name) - with open(asset_path, 'r') as f: - vocab_transform = VocabTransform(load_vocab_from_file(f)) - self.assertEqual(vocab_transform(['of', 'that', 'new']), - [7, 18, 24]) - jit_vocab_transform = torch.jit.script(vocab_transform) - self.assertEqual(jit_vocab_transform(['of', 'that', 'new', 'that']), - [7, 18, 24, 18]) + vocab_transform = VocabTransform(load_vocab_from_file(asset_path)) + self.assertEqual(vocab_transform(['of', 'that', 'new']), + [7, 18, 24]) + jit_vocab_transform = torch.jit.script(vocab_transform) + self.assertEqual(jit_vocab_transform(['of', 'that', 'new', 'that']), + [7, 18, 24, 18]) def test_errors_vectors_python(self): tokens = [] @@ -179,27 +178,25 @@ def test_glove_different_dims(self): def test_vocab_from_file(self): asset_name = 'vocab_test.txt' asset_path = get_asset_path(asset_name) - with open(asset_path, 'r') as f: - v = load_vocab_from_file(f, unk_token='') - expected_itos = ['', 'b', 'a', 'c'] - expected_stoi = {x: index for index, x in enumerate(expected_itos)} - self.assertEqual(v.get_itos(), expected_itos) - self.assertEqual(dict(v.get_stoi()), expected_stoi) + v = load_vocab_from_file(asset_path, unk_token='') + expected_itos = ['', 'b', 'a', 'c'] + expected_stoi = {x: index for index, x in enumerate(expected_itos)} + self.assertEqual(v.get_itos(), expected_itos) + self.assertEqual(dict(v.get_stoi()), expected_stoi) def test_vocab_from_raw_text_file(self): asset_name = 'vocab_raw_text_test.txt' asset_path = get_asset_path(asset_name) - with open(asset_path, 'r') as f: - tokenizer = basic_english_normalize() - jit_tokenizer = torch.jit.script(tokenizer) - v = build_vocab_from_text_file(f, jit_tokenizer, unk_token='') - expected_itos = ['', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed', - 'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent', - 'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner', - 'unions', 'with', 'workers'] - expected_stoi = {x: index for index, x in enumerate(expected_itos)} - self.assertEqual(v.get_itos(), expected_itos) - self.assertEqual(dict(v.get_stoi()), expected_stoi) + tokenizer = basic_english_normalize() + jit_tokenizer = torch.jit.script(tokenizer) + v = build_vocab_from_text_file(asset_path, jit_tokenizer, unk_token='') + expected_itos = ['', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed', + 'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent', + 'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner', + 'unions', 'with', 'workers'] + expected_stoi = {x: index for index, x in enumerate(expected_itos)} + self.assertEqual(v.get_itos(), expected_itos) + self.assertEqual(dict(v.get_stoi()), expected_stoi) def test_builtin_pretrained_sentencepiece_processor(self): sp_model_path = download_from_url(PRETRAINED_SP_MODEL['text_unigram_25000']) @@ -241,11 +238,10 @@ def batch_func(data): def test_text_sequential_transform(self): asset_name = 'vocab_test2.txt' asset_path = get_asset_path(asset_name) - with open(asset_path, 'r') as f: - pipeline = TextSequentialTransforms(basic_english_normalize(), load_vocab_from_file(f)) - jit_pipeline = torch.jit.script(pipeline) - self.assertEqual(pipeline('of that new'), [7, 18, 24]) - self.assertEqual(jit_pipeline('of that new'), [7, 18, 24]) + pipeline = TextSequentialTransforms(basic_english_normalize(), load_vocab_from_file(asset_path)) + jit_pipeline = torch.jit.script(pipeline) + self.assertEqual(pipeline('of that new'), [7, 18, 24]) + self.assertEqual(jit_pipeline('of that new'), [7, 18, 24]) def test_vectors_from_file(self): asset_name = 'vectors_test.csv' From 0ba55ef0c28ed4c7a880a91f74c46d2b1f9aab34 Mon Sep 17 00:00:00 2001 From: Parmeet Singh Bhatia Date: Tue, 20 Apr 2021 09:50:59 -0400 Subject: [PATCH 3/4] minor linter issue --- test/experimental/test_vocab.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/experimental/test_vocab.py b/test/experimental/test_vocab.py index 73f74bc16e..85c58ea67d 100644 --- a/test/experimental/test_vocab.py +++ b/test/experimental/test_vocab.py @@ -219,9 +219,9 @@ def test_vocab_load_and_save(self): def test_build_vocab_iterator(self): iterator = [['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', - 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']] + 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']] v = build_vocab_from_iterator(iterator) expected_itos = ['', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low'] expected_stoi = {x: index for index, x in enumerate(expected_itos)} self.assertEqual(v.get_itos(), expected_itos) - self.assertEqual(dict(v.get_stoi()), expected_stoi) \ No newline at end of file + self.assertEqual(dict(v.get_stoi()), expected_stoi) From de8986446192c4931ae9af9e800d89be8e63d124 Mon Sep 17 00:00:00 2001 From: Parmeet Bhatia Date: Wed, 21 Apr 2021 11:33:41 -0700 Subject: [PATCH 4/4] added line-by-line lookup benchmark --- benchmark/benchmark_experimental_vocab.py | 43 +++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/benchmark/benchmark_experimental_vocab.py b/benchmark/benchmark_experimental_vocab.py index 3cd4e49a1d..a19e6267d8 100644 --- a/benchmark/benchmark_experimental_vocab.py +++ b/benchmark/benchmark_experimental_vocab.py @@ -1,7 +1,10 @@ import argparse from collections import (Counter, OrderedDict) import time - +from timeit import default_timer as timer +import random +import string +from matplotlib import pyplot as plt import torch from torchtext.experimental.datasets import DATASETS from torchtext.experimental.vocab import ( @@ -16,6 +19,42 @@ from torchtext.experimental.transforms import basic_english_normalize +def compare_legacy_and_experimental_batch_lookup(): + num_tokens = 1000 + num_letters = 6 + num_lines = 100000 + vocab = [''.join(random.sample(string.ascii_letters * num_letters, num_letters)) for _ in range(num_tokens)] + counter = Counter() + counter.update(vocab) + legacy_vocab = Vocab(counter) + experimental_vocab = VocabExperimental(counter) + speed_ups = [] + token_lengths = [i for i in range(2, 100)] + for i in token_lengths: + lines = [random.sample(vocab, i) for _ in range(num_lines)] + start_time = timer() + for l in lines: + legacy_vocab.lookup_indices(l) + legacy_time = timer() - start_time + + start_time = timer() + for l in lines: + experimental_vocab.lookup_indices(l) + + experimental_time = timer() - start_time + + speed_ups.append(legacy_time / experimental_time) + print("speed-up={} for average length={}".format(legacy_time / experimental_time, i)) + del lines + + plt.close() + fig, ax = plt.subplots(1,1) + ax.plot(token_lengths, speed_ups) + ax.set_xlabel('Average Tokens per line') + ax.set_ylabel('Speed-up') + plt.savefig("speedup.jpg") + + def legacy_vocab_from_file_object(file_like_object, **kwargs): r"""Create a `Vocab` object from a file like object. @@ -76,7 +115,7 @@ def benchmark_experimental_vocab_construction(vocab_file_path, is_raw_text=True, print("Construction time:", time.monotonic() - t0) -def benchmark_experimental_vocab_lookup(vocab_file_path=None, dataset = 'AG_NEWS'): +def benchmark_experimental_vocab_lookup(vocab_file_path=None, dataset='AG_NEWS'): def _run_benchmark_lookup(tokens, vocab): t0 = time.monotonic() # list lookup