import argparse
from collections import (Counter, OrderedDict)
import time
-import random
-import string
-from timeit import default_timer as timer
-from matplotlib import pyplot as plt
import torch
from torchtext.datasets import DATASETS
from torchtext.experimental.vocab_factory import (
    build_vocab_from_text_file,
    load_vocab_from_file,
)
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import vocab as VocabNew
-from torchtext.legacy.vocab import (
-    Vocab,
-    build_vocab_from_iterator as build_vocab_from_iterator_legacy,
-)
-from torchtext.experimental.transforms import (
+from torchtext.experimental.transforms import (
    basic_english_normalize,
)
from torchtext.data.utils import get_tokenizer

+
def build_vocab(data, transforms):
    def apply_transforms(data):
        for _, line in data:
@@ -31,96 +24,16 @@ def apply_transforms(data):
    return vocab


-def compare_legacy_and_new_batch_lookup():
-    num_tokens = 1000
-    num_letters = 6
-    num_lines = 100000
-    vocab = [''.join(random.sample(string.ascii_letters * num_letters, num_letters)) for _ in range(num_tokens)]
-    counter = Counter()
-    counter.update(vocab)
-    legacy_vocab = Vocab(counter)
-    new_vocab = VocabNew(counter)
-    speed_ups = []
-    token_lengths = [i for i in range(2, 100)]
-    for i in token_lengths:
-        lines = [random.sample(vocab, i) for _ in range(num_lines)]
-        start_time = timer()
-        for text in lines:
-            legacy_vocab.lookup_indices(text)
-        legacy_time = timer() - start_time
-
-        start_time = timer()
-        for text in lines:
-            new_vocab.lookup_indices(text)
-
-        new_time = timer() - start_time
-
-        speed_ups.append(legacy_time / new_time)
-        print("speed-up={} for average length={}".format(legacy_time / new_time, i))
-        del lines
-
-    plt.close()
-    fig, ax = plt.subplots(1, 1)
-    ax.plot(token_lengths, speed_ups)
-    ax.set_xlabel('Average Tokens per line')
-    ax.set_ylabel('Speed-up')
-    plt.savefig("speedup.jpg")
-
-
-def legacy_vocab_from_file_object(file_like_object, **kwargs):
-    r"""Create a `Vocab` object from a file like object.
-
-    The `file_like_object` should contain tokens seperated by new lines. Note that the vocab
-    will be created in the order that the tokens first appear in the file (and not by the frequency of tokens).
-
-    Format for txt file:
-        token1
-        token2
-        ...
-        token_n
-
-    Args:
-        file_like_object (FileObject): a file like object to read data from.
-        Remaining keyword arguments: Passed to the constructor of Vocab class.
-
-    Returns:
-        Vocab: a `Vocab` object.
-
-    Examples:
-        >>> from torchtext.vocab import vocab_from_file_object
-        >>> f = open('vocab.txt', 'r')
-        >>> v = vocab_from_file_object(f, specials=('<unk>', '<pad>', '<eos>'), specials_first=False)
-    """
-    tokenizer = basic_english_normalize()
-
-    def tokenize(line):
-        return tokenizer(line)
-
-    def token_iterator(lines):
-        for line in lines:
-            for token in tokenize(line):
-                yield token
-
-    return build_vocab_from_iterator_legacy(token_iterator(file_like_object))
-
-
-def benchmark_new_vocab_construction(vocab_file_path, is_raw_text=True, is_legacy=True, num_iters=1):
+def benchmark_new_vocab_construction(vocab_file_path, is_raw_text=True, num_iters=1):
    f = open(vocab_file_path, 'r')
    t0 = time.monotonic()
    if is_raw_text:
-        if is_legacy:
-            print("Loading from raw text file with legacy python function")
-            for _ in range(num_iters):
-                legacy_vocab_from_file_object(f)
-
-            print("Construction time:", time.monotonic() - t0)
-        else:
-            print("Loading from raw text file with basic_english_normalize tokenizer")
-            for _ in range(num_iters):
-                tokenizer = basic_english_normalize()
-                jited_tokenizer = torch.jit.script(tokenizer)
-                build_vocab_from_text_file(vocab_file_path, jited_tokenizer, num_cpus=1)
-            print("Construction time:", time.monotonic() - t0)
+        print("Loading from raw text file with basic_english_normalize tokenizer")
+        for _ in range(num_iters):
+            tokenizer = basic_english_normalize()
+            jited_tokenizer = torch.jit.script(tokenizer)
+            build_vocab_from_text_file(vocab_file_path, jited_tokenizer, num_cpus=1)
+        print("Construction time:", time.monotonic() - t0)
    else:
        for _ in range(num_iters):
            load_vocab_from_file(f)
@@ -146,9 +59,9 @@ def _run_benchmark_lookup(tokens, vocab):
    tokens_lists = []
    tokenizer = get_tokenizer("basic_english")
    for (_, text) in DATASETS[dataset](split='train'):
-        cur_tokens = tokenizer(text)
-        tokens_lists.append(cur_tokens)
-        tokens += cur_tokens
+        cur_tokens = tokenizer(text)
+        tokens_lists.append(cur_tokens)
+        tokens += cur_tokens

    if vocab_file_path:
        print("Loading Vocab from file {}".format(vocab_file_path))
@@ -158,12 +71,6 @@ def token_iterator(file_path):
            for token in f:
                yield token

-        # existing Vocab construction
-        print("Vocab")
-        t0 = time.monotonic()
-        v_existing = build_vocab_from_iterator_legacy(token_iterator(vocab_file_path))
-        print("Construction time:", time.monotonic() - t0)
-
        # new Vocab construction
        print("Vocab New")
        t0 = time.monotonic()
@@ -176,25 +83,13 @@ def token_iterator(file_path):
        sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        ordered_dict = OrderedDict(sorted_by_freq_tuples)

-        # existing Vocab construction
-        print("Vocab")
-        t0 = time.monotonic()
-        v_existing = Vocab(counter)
-        print("Construction time:", time.monotonic() - t0)
-
        # new Vocab construction
        print("Vocab New")
        t0 = time.monotonic()
        v_new = VocabNew(ordered_dict)
        print("Construction time:", time.monotonic() - t0)
    jit_v_new = torch.jit.script(v_new)

-    # existing Vocab eager lookup
-    print("Vocab - Eager Mode")
-    _run_benchmark_lookup(tokens, v_existing)
-    _run_benchmark_lookup([tokens], v_existing)
-    _run_benchmark_lookup(tokens_lists, v_existing)
-
    # new Vocab eager lookup
    print("Vocab New - Eager Mode")
    _run_benchmark_lookup(tokens, v_new)
@@ -215,8 +110,6 @@ def token_iterator(file_path):
                        help='run benchmark for constructing a vocab (default=False)')
    parser.add_argument('--is-raw-text', type=bool, default=True,
                        help='construct vocab from raw text file (default=True)')
-    parser.add_argument('--is-legacy', type=bool, default=False,
-                        help='construct vocab using legacy implementation (default=False)')
    parser.add_argument('--vocab-filename-construction', type=str, default='vocab.txt',
                        help='The name of vocab file used for construction')
    parser.add_argument('--vocab-filename-lookup', type=str, default=None,
@@ -226,8 +119,7 @@ def token_iterator(file_path):
    args = parser.parse_args()

    if args.run_construction_benchmark:
-        print("is_legacy", args.is_legacy)
        benchmark_new_vocab_construction(args.vocab_filename_construction,
-                                         is_raw_text=args.is_raw_text, is_legacy=args.is_legacy)
+                                         is_raw_text=args.is_raw_text)
    else:
        benchmark_new_vocab_lookup(args.vocab_filename_lookup, args.dataset)
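
For reference, the non-legacy construction path that this change keeps can be exercised on its own. The snippet below is a minimal sketch, not part of the diff: the file name vocab.txt is a placeholder, and the calls simply mirror the basic_english_normalize / build_vocab_from_text_file usage retained in benchmark_new_vocab_construction.

import torch
from torchtext.experimental.transforms import basic_english_normalize
from torchtext.experimental.vocab_factory import build_vocab_from_text_file

# Sketch only: 'vocab.txt' stands in for any raw-text file, one example per line.
tokenizer = basic_english_normalize()
jited_tokenizer = torch.jit.script(tokenizer)  # JIT-script the tokenizer, as the benchmark does
v = build_vocab_from_text_file("vocab.txt", jited_tokenizer, num_cpus=1)
print(len(v))  # rough size check (assumes the returned Vocab supports len())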