Remove TorchText legacy folder and its associated tests #1437

Merged: 2 commits, Nov 10, 2021
134 changes: 13 additions & 121 deletions benchmark/benchmark_vocab.py
@@ -1,10 +1,6 @@
import argparse
from collections import (Counter, OrderedDict)
import time
import random
import string
from timeit import default_timer as timer
from matplotlib import pyplot as plt
import torch
from torchtext.datasets import DATASETS
from torchtext.experimental.vocab_factory import (
@@ -13,15 +9,12 @@
)
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import vocab as VocabNew
from torchtext.legacy.vocab import (
Vocab,
build_vocab_from_iterator as build_vocab_from_iterator_legacy,
)
from torchtext.experimental.transforms import(
from torchtext.experimental.transforms import (
basic_english_normalize,
)
from torchtext.data.utils import get_tokenizer


def build_vocab(data, transforms):
def apply_transforms(data):
for _, line in data:
@@ -31,96 +24,16 @@ def apply_transforms(data):
return vocab


def compare_legacy_and_new_batch_lookup():
num_tokens = 1000
num_letters = 6
num_lines = 100000
vocab = [''.join(random.sample(string.ascii_letters * num_letters, num_letters)) for _ in range(num_tokens)]
counter = Counter()
counter.update(vocab)
legacy_vocab = Vocab(counter)
new_vocab = VocabNew(counter)
speed_ups = []
token_lengths = [i for i in range(2, 100)]
for i in token_lengths:
lines = [random.sample(vocab, i) for _ in range(num_lines)]
start_time = timer()
for text in lines:
legacy_vocab.lookup_indices(text)
legacy_time = timer() - start_time

start_time = timer()
for text in lines:
new_vocab.lookup_indices(text)

new_time = timer() - start_time

speed_ups.append(legacy_time / new_time)
print("speed-up={} for average length={}".format(legacy_time / new_time, i))
del lines

plt.close()
fig, ax = plt.subplots(1, 1)
ax.plot(token_lengths, speed_ups)
ax.set_xlabel('Average Tokens per line')
ax.set_ylabel('Speed-up')
plt.savefig("speedup.jpg")


def legacy_vocab_from_file_object(file_like_object, **kwargs):
r"""Create a `Vocab` object from a file like object.

    The `file_like_object` should contain tokens separated by new lines. Note that the vocab
will be created in the order that the tokens first appear in the file (and not by the frequency of tokens).

Format for txt file:
token1
token2
...
token_n

Args:
file_like_object (FileObject): a file like object to read data from.
Remaining keyword arguments: Passed to the constructor of Vocab class.

Returns:
Vocab: a `Vocab` object.

Examples:
>>> from torchtext.vocab import vocab_from_file_object
>>> f = open('vocab.txt', 'r')
>>> v = vocab_from_file_object(f, specials=('<unk>', '<pad>', '<eos>'), specials_first=False)
"""
tokenizer = basic_english_normalize()

def tokenize(line):
return tokenizer(line)

def token_iterator(lines):
for line in lines:
for token in tokenize(line):
yield token

return build_vocab_from_iterator_legacy(token_iterator(file_like_object))


def benchmark_new_vocab_construction(vocab_file_path, is_raw_text=True, is_legacy=True, num_iters=1):
def benchmark_new_vocab_construction(vocab_file_path, is_raw_text=True, num_iters=1):
f = open(vocab_file_path, 'r')
t0 = time.monotonic()
if is_raw_text:
if is_legacy:
print("Loading from raw text file with legacy python function")
for _ in range(num_iters):
legacy_vocab_from_file_object(f)

print("Construction time:", time.monotonic() - t0)
else:
print("Loading from raw text file with basic_english_normalize tokenizer")
for _ in range(num_iters):
tokenizer = basic_english_normalize()
jited_tokenizer = torch.jit.script(tokenizer)
build_vocab_from_text_file(vocab_file_path, jited_tokenizer, num_cpus=1)
print("Construction time:", time.monotonic() - t0)
print("Loading from raw text file with basic_english_normalize tokenizer")
for _ in range(num_iters):
tokenizer = basic_english_normalize()
jited_tokenizer = torch.jit.script(tokenizer)
build_vocab_from_text_file(vocab_file_path, jited_tokenizer, num_cpus=1)
print("Construction time:", time.monotonic() - t0)
else:
for _ in range(num_iters):
load_vocab_from_file(f)
@@ -146,9 +59,9 @@ def _run_benchmark_lookup(tokens, vocab):
tokens_lists = []
tokenizer = get_tokenizer("basic_english")
for (_, text) in DATASETS[dataset](split='train'):
cur_tokens = tokenizer(text)
tokens_lists.append(cur_tokens)
tokens += cur_tokens
cur_tokens = tokenizer(text)
tokens_lists.append(cur_tokens)
tokens += cur_tokens

if vocab_file_path:
print("Loading Vocab from file {}".format(vocab_file_path))
@@ -158,12 +71,6 @@ def token_iterator(file_path):
for token in f:
yield token

# existing Vocab construction
print("Vocab")
t0 = time.monotonic()
v_existing = build_vocab_from_iterator_legacy(token_iterator(vocab_file_path))
print("Construction time:", time.monotonic() - t0)

# new Vocab construction
print("Vocab New")
t0 = time.monotonic()
@@ -176,25 +83,13 @@ def token_iterator(file_path):
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# existing Vocab construction
print("Vocab")
t0 = time.monotonic()
v_existing = Vocab(counter)
print("Construction time:", time.monotonic() - t0)

# new Vocab construction
print("Vocab New")
t0 = time.monotonic()
v_new = VocabNew(ordered_dict)
print("Construction time:", time.monotonic() - t0)
jit_v_new = torch.jit.script(v_new)

# existing Vocab eager lookup
print("Vocab - Eager Mode")
_run_benchmark_lookup(tokens, v_existing)
_run_benchmark_lookup([tokens], v_existing)
_run_benchmark_lookup(tokens_lists, v_existing)

# new Vocab eager lookup
print("Vocab New - Eager Mode")
_run_benchmark_lookup(tokens, v_new)
@@ -215,8 +110,6 @@ def token_iterator(file_path):
help='run benchmark for constructing a vocab (default=False)')
parser.add_argument('--is-raw-text', type=bool, default=True,
help='construct vocab from raw text file (default=True)')
parser.add_argument('--is-legacy', type=bool, default=False,
help='construct vocab using legacy implementation (default=False)')
parser.add_argument('--vocab-filename-construction', type=str, default='vocab.txt',
help='The name of vocab file used for construction')
parser.add_argument('--vocab-filename-lookup', type=str, default=None,
@@ -226,8 +119,7 @@ def token_iterator(file_path):
args = parser.parse_args()

if args.run_construction_benchmark:
print("is_legacy", args.is_legacy)
benchmark_new_vocab_construction(args.vocab_filename_construction,
is_raw_text=args.is_raw_text, is_legacy=args.is_legacy)
is_raw_text=args.is_raw_text)
else:
benchmark_new_vocab_lookup(args.vocab_filename_lookup, args.dataset)
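
Since this PR removes `torchtext.legacy.vocab.Vocab`, callers of the old counter-based constructor need the new factory that the benchmark above already uses. Below is a minimal sketch of that replacement path, not part of the PR itself; the token data is made up for illustration and it assumes a torchtext release where `torchtext.legacy` no longer exists:

```python
from collections import Counter, OrderedDict

from torchtext.vocab import build_vocab_from_iterator, vocab

# Toy corpus standing in for real tokenized text.
token_lists = [["hello", "world"], ["hello", "torchtext"]]

# Old path (removed by this PR): torchtext.legacy.vocab.Vocab(counter).
# New path: sort the counter by frequency and hand an OrderedDict to vocab(),
# exactly as the benchmark does for its "Vocab New" construction.
counter = Counter(tok for line in token_lists for tok in line)
ordered = OrderedDict(sorted(counter.items(), key=lambda x: x[1], reverse=True))
v = vocab(ordered)

# Or build directly from an iterator of token lists.
v_iter = build_vocab_from_iterator(token_lists)

# Batch lookup on the new Vocab, mirroring _run_benchmark_lookup above.
print(v.lookup_indices(["hello", "world"]))
print(v_iter.lookup_indices(["hello", "torchtext"]))
```

This is also why the `--is-legacy` flag and the legacy comparison branches are deleted in the diff: with the legacy folder gone, there is no second implementation left to benchmark against.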