Skip to content

Commit 2cebac3

Browse files
authored
Remove TorchText legacy folder and its associated tests (#1437)
* Remove TorchText legacy folder and its associated tests
* Fix Flake Error
1 parent f298494 commit 2cebac3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+54
-7184
lines changed

benchmark/benchmark_vocab.py

Lines changed: 13 additions & 121 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,6 @@
11
import argparse
22
from collections import (Counter, OrderedDict)
33
import time
4-
import random
5-
import string
6-
from timeit import default_timer as timer
7-
from matplotlib import pyplot as plt
84
import torch
95
from torchtext.datasets import DATASETS
106
from torchtext.experimental.vocab_factory import (
@@ -13,15 +9,12 @@
139
)
1410
from torchtext.vocab import build_vocab_from_iterator
1511
from torchtext.vocab import vocab as VocabNew
16-
from torchtext.legacy.vocab import (
17-
Vocab,
18-
build_vocab_from_iterator as build_vocab_from_iterator_legacy,
19-
)
20-
from torchtext.experimental.transforms import(
12+
from torchtext.experimental.transforms import (
2113
basic_english_normalize,
2214
)
2315
from torchtext.data.utils import get_tokenizer
2416

17+
2518
def build_vocab(data, transforms):
2619
def apply_transforms(data):
2720
for _, line in data:
@@ -31,96 +24,16 @@ def apply_transforms(data):
3124
return vocab
3225

3326

34-
def compare_legacy_and_new_batch_lookup():
35-
num_tokens = 1000
36-
num_letters = 6
37-
num_lines = 100000
38-
vocab = [''.join(random.sample(string.ascii_letters * num_letters, num_letters)) for _ in range(num_tokens)]
39-
counter = Counter()
40-
counter.update(vocab)
41-
legacy_vocab = Vocab(counter)
42-
new_vocab = VocabNew(counter)
43-
speed_ups = []
44-
token_lengths = [i for i in range(2, 100)]
45-
for i in token_lengths:
46-
lines = [random.sample(vocab, i) for _ in range(num_lines)]
47-
start_time = timer()
48-
for text in lines:
49-
legacy_vocab.lookup_indices(text)
50-
legacy_time = timer() - start_time
51-
52-
start_time = timer()
53-
for text in lines:
54-
new_vocab.lookup_indices(text)
55-
56-
new_time = timer() - start_time
57-
58-
speed_ups.append(legacy_time / new_time)
59-
print("speed-up={} for average length={}".format(legacy_time / new_time, i))
60-
del lines
61-
62-
plt.close()
63-
fig, ax = plt.subplots(1, 1)
64-
ax.plot(token_lengths, speed_ups)
65-
ax.set_xlabel('Average Tokens per line')
66-
ax.set_ylabel('Speed-up')
67-
plt.savefig("speedup.jpg")
68-
69-
70-
def legacy_vocab_from_file_object(file_like_object, **kwargs):
71-
r"""Create a `Vocab` object from a file like object.
72-
73-
The `file_like_object` should contain tokens separated by new lines. Note that the vocab
74-
will be created in the order that the tokens first appear in the file (and not by the frequency of tokens).
75-
76-
Format for txt file:
77-
token1
78-
token2
79-
...
80-
token_n
81-
82-
Args:
83-
file_like_object (FileObject): a file like object to read data from.
84-
Remaining keyword arguments: Passed to the constructor of Vocab class.
85-
86-
Returns:
87-
Vocab: a `Vocab` object.
88-
89-
Examples:
90-
>>> from torchtext.vocab import vocab_from_file_object
91-
>>> f = open('vocab.txt', 'r')
92-
>>> v = vocab_from_file_object(f, specials=('<unk>', '<pad>', '<eos>'), specials_first=False)
93-
"""
94-
tokenizer = basic_english_normalize()
95-
96-
def tokenize(line):
97-
return tokenizer(line)
98-
99-
def token_iterator(lines):
100-
for line in lines:
101-
for token in tokenize(line):
102-
yield token
103-
104-
return build_vocab_from_iterator_legacy(token_iterator(file_like_object))
105-
106-
107-
def benchmark_new_vocab_construction(vocab_file_path, is_raw_text=True, is_legacy=True, num_iters=1):
27+
def benchmark_new_vocab_construction(vocab_file_path, is_raw_text=True, num_iters=1):
10828
f = open(vocab_file_path, 'r')
10929
t0 = time.monotonic()
11030
if is_raw_text:
111-
if is_legacy:
112-
print("Loading from raw text file with legacy python function")
113-
for _ in range(num_iters):
114-
legacy_vocab_from_file_object(f)
115-
116-
print("Construction time:", time.monotonic() - t0)
117-
else:
118-
print("Loading from raw text file with basic_english_normalize tokenizer")
119-
for _ in range(num_iters):
120-
tokenizer = basic_english_normalize()
121-
jited_tokenizer = torch.jit.script(tokenizer)
122-
build_vocab_from_text_file(vocab_file_path, jited_tokenizer, num_cpus=1)
123-
print("Construction time:", time.monotonic() - t0)
31+
print("Loading from raw text file with basic_english_normalize tokenizer")
32+
for _ in range(num_iters):
33+
tokenizer = basic_english_normalize()
34+
jited_tokenizer = torch.jit.script(tokenizer)
35+
build_vocab_from_text_file(vocab_file_path, jited_tokenizer, num_cpus=1)
36+
print("Construction time:", time.monotonic() - t0)
12437
else:
12538
for _ in range(num_iters):
12639
load_vocab_from_file(f)
@@ -146,9 +59,9 @@ def _run_benchmark_lookup(tokens, vocab):
14659
tokens_lists = []
14760
tokenizer = get_tokenizer("basic_english")
14861
for (_, text) in DATASETS[dataset](split='train'):
149-
cur_tokens = tokenizer(text)
150-
tokens_lists.append(cur_tokens)
151-
tokens += cur_tokens
62+
cur_tokens = tokenizer(text)
63+
tokens_lists.append(cur_tokens)
64+
tokens += cur_tokens
15265

15366
if vocab_file_path:
15467
print("Loading Vocab from file {}".format(vocab_file_path))
@@ -158,12 +71,6 @@ def token_iterator(file_path):
15871
for token in f:
15972
yield token
16073

161-
# existing Vocab construction
162-
print("Vocab")
163-
t0 = time.monotonic()
164-
v_existing = build_vocab_from_iterator_legacy(token_iterator(vocab_file_path))
165-
print("Construction time:", time.monotonic() - t0)
166-
16774
# new Vocab construction
16875
print("Vocab New")
16976
t0 = time.monotonic()
@@ -176,25 +83,13 @@ def token_iterator(file_path):
17683
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
17784
ordered_dict = OrderedDict(sorted_by_freq_tuples)
17885

179-
# existing Vocab construction
180-
print("Vocab")
181-
t0 = time.monotonic()
182-
v_existing = Vocab(counter)
183-
print("Construction time:", time.monotonic() - t0)
184-
18586
# new Vocab construction
18687
print("Vocab New")
18788
t0 = time.monotonic()
18889
v_new = VocabNew(ordered_dict)
18990
print("Construction time:", time.monotonic() - t0)
19091
jit_v_new = torch.jit.script(v_new)
19192

192-
# existing Vocab eager lookup
193-
print("Vocab - Eager Mode")
194-
_run_benchmark_lookup(tokens, v_existing)
195-
_run_benchmark_lookup([tokens], v_existing)
196-
_run_benchmark_lookup(tokens_lists, v_existing)
197-
19893
# new Vocab eager lookup
19994
print("Vocab New - Eager Mode")
20095
_run_benchmark_lookup(tokens, v_new)
@@ -215,8 +110,6 @@ def token_iterator(file_path):
215110
help='run benchmark for constructing a vocab (default=False)')
216111
parser.add_argument('--is-raw-text', type=bool, default=True,
217112
help='construct vocab from raw text file (default=True)')
218-
parser.add_argument('--is-legacy', type=bool, default=False,
219-
help='construct vocab using legacy implementation (default=False)')
220113
parser.add_argument('--vocab-filename-construction', type=str, default='vocab.txt',
221114
help='The name of vocab file used for construction')
222115
parser.add_argument('--vocab-filename-lookup', type=str, default=None,
@@ -226,8 +119,7 @@ def token_iterator(file_path):
226119
args = parser.parse_args()
227120

228121
if args.run_construction_benchmark:
229-
print("is_legacy", args.is_legacy)
230122
benchmark_new_vocab_construction(args.vocab_filename_construction,
231-
is_raw_text=args.is_raw_text, is_legacy=args.is_legacy)
123+
is_raw_text=args.is_raw_text)
232124
else:
233125
benchmark_new_vocab_lookup(args.vocab_filename_lookup, args.dataset)

0 commit comments

Comments
 (0)