Example docs #576

Merged
merged 22 commits on Jul 31, 2019
Changes from all commits
5 changes: 5 additions & 0 deletions docs/source/data.rst
@@ -137,6 +137,11 @@ Functions

.. autofunction:: get_tokenizer

:hidden:`ngrams_iterator`
~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: ngrams_iterator

:hidden:`interleave_keys`
~~~~~~~~~~~~~~~~~~~~~~~~~

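For context on the new ngrams_iterator entry, here is a minimal usage sketch. It assumes the torchtext.data.utils version of the helper, which, as I understand it, yields the original tokens followed by the space-joined n-grams:

from torchtext.data.utils import get_tokenizer, ngrams_iterator

tokenizer = get_tokenizer("basic_english")
tokens = tokenizer("here we are")
# Unigrams first, then the joined bigrams.
print(list(ngrams_iterator(tokens, 2)))
# expected: ['here', 'we', 'are', 'here we', 'we are']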
9 changes: 5 additions & 4 deletions docs/source/vocab.rst
@@ -55,12 +55,13 @@ Pretrained Word Embeddings
Misc.
-----

:hidden:`_default_unk_index`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:hidden:`build_vocab_from_iterator`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: _default_unk_index
.. autodata:: build_vocab_from_iterator

:hidden:`pretrained_aliases`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autodata:: pretrained_aliases
.. autodata:: pretrained_aliases
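For the newly documented build_vocab_from_iterator, here is a minimal usage sketch, assuming the 0.4-era signature that takes a single iterator yielding lists of tokens (the sample lines are illustrative):

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")
lines = ["the quick brown fox", "jumps over the lazy dog"]

# Each element of the iterator is the token list for one line.
vocab = build_vocab_from_iterator(tokenizer(line) for line in lines)
print(len(vocab), vocab["the"])  # vocab size and the index assigned to "the"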

18 changes: 14 additions & 4 deletions examples/text_classification/create_datasets.py
@@ -6,14 +6,24 @@

from torchtext.datasets import text_classification

r"""
Once you have the datasets, you can save them as a list of tensors
and load them later in other projects. Here is an example of saving and
loading the text_classification datasets.
"""

if __name__ == "__main__":
parser = argparse.ArgumentParser(description=(
'Create list of Tensors for training and '
'testing based on given datasets'))
parser.add_argument('dataset', choices=text_classification.DATASETS)
parser.add_argument('--logging-level', default='WARNING')
parser.add_argument('--ngrams', type=int, default=2)
parser.add_argument('--root', default='.data')
parser.add_argument('dataset', choices=text_classification.DATASETS,
help='dataset name')
parser.add_argument('--logging-level', default='WARNING',
help='logging level (default=WARNING)')
parser.add_argument('--ngrams', type=int, default=2,
help='ngrams (default=2)')
parser.add_argument('--root', default='.data',
help='data directory (default=.data)')
args = parser.parse_args()

logging.basicConfig(level=getattr(logging, args.logging_level))
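To illustrate the docstring added at the top of create_datasets.py, here is a rough sketch of the save/load round trip it refers to; the dataset name and file paths are illustrative, not part of the script:

import torch
from torchtext.datasets import text_classification

# Build the dataset once (this downloads the raw data into root if needed).
train_dataset, test_dataset = text_classification.DATASETS["AG_NEWS"](
    root=".data", ngrams=2)
torch.save(train_dataset, "ag_news_train.data")
torch.save(test_dataset, "ag_news_test.data")

# Load the saved datasets later, e.g. in another project.
train_dataset = torch.load("ag_news_train.data")
test_dataset = torch.load("ag_news_test.data")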
116 changes: 101 additions & 15 deletions examples/text_classification/iterable_train.py
@@ -15,8 +15,32 @@

from tqdm import tqdm

r"""
This example shows how to build an iterable dataset from an iterator. The
get_csv_iterator() function is used to read the CSV data file. A custom
iterable dataset class sets up the iterators for training the model.
"""


def generate_batch(batch):
"""
Since the text entries have different lengths, a custom function
generate_batch() is used to generate data batches and offsets,
which are compatible with EmbeddingBag. The function is passed
to 'collate_fn' in torch.utils.data.DataLoader. The input to
'collate_fn' is a list of tensors with the size of batch_size,
and the 'collate_fn' function packs them into a mini-batch.
Make sure that 'collate_fn' is declared as a top-level def so
that the function is available in each worker.
Output:
text: the text entries in the batch are packed into a list and
concatenated as a single tensor, the input of nn.EmbeddingBag.
offsets: a tensor of delimiters representing the beginning index
of each individual sequence in the text tensor.
label: a tensor holding the labels of the individual text entries.
"""

label = torch.tensor([entry[0] for entry in batch])
text = [entry[1] for entry in batch]
offsets = [0] + [len(entry) for entry in text]
@@ -25,15 +49,30 @@ def generate_batch(batch):
return text, offsets, label
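As a concrete illustration of the docstring above, here is a tiny batch run through generate_batch(). It assumes the collapsed lines of the function concatenate the text tensors and take a cumulative sum of the entry lengths to produce the offsets:

import torch

# Two (label, numericalized-text) entries of different lengths.
batch = [(0, torch.tensor([4, 7, 2])), (1, torch.tensor([5, 9]))]
text, offsets, label = generate_batch(batch)
print(text)     # tensor([4, 7, 2, 5, 9])  all tokens packed into one 1-D tensor
print(offsets)  # tensor([0, 3])           start index of each entry
print(label)    # tensor([0, 1])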


r"""
torch.utils.data.DataLoader is recommended for PyTorch users to load data.
We use DataLoader here to load the datasets and pass them to the train()
and test() functions.
"""


def train(lr_, num_epoch, data_):
r"""
Here we use the SGD optimizer to train the model.

Arguments:
lr_: learning rate
num_epoch: the number of epochs for training the model
data_: the data used to train the model
"""
data = DataLoader(
data_,
batch_size=batch_size,
collate_fn=generate_batch,
num_workers=args.num_workers,
pin_memory=True)
optimizer = torch.optim.SGD(model.parameters(), lr=lr_)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, args.lr_gamma)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=args.lr_gamma)
with tqdm(unit_scale=0, unit='lines', total=train_num_lines * num_epochs) as t:
avg_loss = 0.0
for i, (text, offsets, label) in enumerate(data):
@@ -55,6 +94,10 @@ def train(lr_, num_epoch, data_):


def test(data_):
r"""
Arguments:
data_: the data used to test the model
"""
data = DataLoader(
data_,
batch_size=batch_size,
@@ -72,6 +115,19 @@ def test(data_):


def get_csv_iterator(data_path, ngrams, vocab, start=0, num_lines=None):
r"""
Generate an iterator to read the CSV file.
The yielded values are an integer for the label and a tensor for the text part.

Arguments:
data_path: a path for the data file.
ngrams: the number used for ngrams.
vocab: a vocab object holding the string-to-index information.
start: the starting line to read (Default: 0). This is useful for
on-the-fly multi-processing data loading.
num_lines: the number of lines read by the iterator (Default: None).

"""
def iterator(start, num_lines):
tokenizer = get_tokenizer("basic_english")
with io.open(data_path, encoding="utf8") as f:
@@ -93,6 +149,15 @@ def iterator(start, num_lines):


class Dataset(torch.utils.data.IterableDataset):
r"""
An iterable dataset that holds the data. This dataset supports multi-processing
to load the data.

Arguments:
iterator: the iterator used to read the data.
num_lines: the number of lines read by the individual iterator.
num_epochs: the number of epochs.
"""
def __init__(self, iterator, num_lines, num_epochs):
super(Dataset, self).__init__()
self._num_lines = num_lines
@@ -101,6 +166,13 @@ def __init__(self, iterator, num_lines, num_epochs):
self._setup = False

def _setup_iterator(self):
r"""
The _setup_iterator() function assigns the starting line and the number
of lines to read to the individual worker, and then passes them to the
iterator that loads the data.

If worker info is not available, the iterator reads all the lines across epochs.
"""
worker_info = torch.utils.data.get_worker_info()
if worker_info:
chunk = int(self._num_lines / worker_info.num_workers)
@@ -128,6 +200,9 @@ def __iter__(self):


def count(data_path):
r"""
Return the total number of text entries and labels.
"""
with io.open(data_path, encoding="utf8") as f:
reader = unicode_csv_reader(f)
labels = [int(row[0]) for row in reader]
@@ -139,20 +214,31 @@
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Train a text classification model on text classification datasets.')
parser.add_argument('train_data_path')
parser.add_argument('test_data_path')
parser.add_argument('vocab')
parser.add_argument('--num-epochs', type=int, default=3)
parser.add_argument('--embed-dim', type=int, default=128)
parser.add_argument('--batch-size', type=int, default=64)
parser.add_argument('--lr', type=float, default=64.0)
parser.add_argument('--lr-gamma', type=float, default=0.999)
parser.add_argument('--ngrams', type=int, default=2)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--device', default='cpu')
parser.add_argument('--data', default='.data')
parser.add_argument('--save-model-path')
parser.add_argument('--logging-level', default='WARNING')
parser.add_argument('train_data_path', help='path for train data')
parser.add_argument('test_data_path', help='path for test data')
parser.add_argument('vocab', help='path for vocab object')
parser.add_argument('--num-epochs', type=int, default=3,
help='num epochs (default=3)')
parser.add_argument('--embed-dim', type=int, default=128,
help='embed dim. (default=128)')
parser.add_argument('--batch-size', type=int, default=64,
help='batch size (default=64)')
parser.add_argument('--lr', type=float, default=4.0,
help='learning rate (default=4.0)')
parser.add_argument('--lr-gamma', type=float, default=0.8,
help='gamma value for lr (default=0.8)')
parser.add_argument('--ngrams', type=int, default=2,
help='ngrams (default=2)')
parser.add_argument('--num-workers', type=int, default=1,
help='num of workers (default=1)')
parser.add_argument('--device', default='cpu',
help='device (default=cpu)')
parser.add_argument('--data', default='.data',
help='data directory (default=.data)')
parser.add_argument('--save-model-path',
help='path for saving model')
parser.add_argument('--logging-level', default='WARNING',
help='logging level (default=WARNING)')
args = parser.parse_args()

num_epochs = args.num_epochs
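The worker-chunking behaviour described in the Dataset and _setup_iterator docstrings above can be hard to see in the diff, so here is a stripped-down, hypothetical sketch of the same pattern (not the code in this PR, just the idea it describes):

from torch.utils.data import DataLoader, IterableDataset, get_worker_info


class ChunkedIterableDataset(IterableDataset):
    # Hypothetical minimal version of the worker-aware chunking pattern.
    def __init__(self, num_lines):
        self.num_lines = num_lines

    def __iter__(self):
        worker_info = get_worker_info()
        if worker_info is None:
            # Single-process loading: read everything.
            start, end = 0, self.num_lines
        else:
            # Split the lines across workers; each worker reads only its chunk.
            chunk = self.num_lines // worker_info.num_workers
            start = chunk * worker_info.id
            last = worker_info.id == worker_info.num_workers - 1
            end = self.num_lines if last else start + chunk
        for i in range(start, end):
            yield i  # stand-in for a (label, text_tensor) pair read from the CSV


if __name__ == "__main__":
    loader = DataLoader(ChunkedIterableDataset(10), num_workers=2)
    print(sorted(int(x) for x in loader))  # every line appears exactly once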
22 changes: 22 additions & 0 deletions examples/text_classification/model.py
@@ -1,5 +1,20 @@
import torch.nn as nn

r"""
The model is composed of an EmbeddingBag layer and a linear layer.

nn.EmbeddingBag computes the mean of 'bags' of embeddings. The text
entries here have different lengths. nn.EmbeddingBag requires no
padding because the lengths of the sentences are saved in offsets.
Therefore, this method is much faster than the original one
with TorchText Iterator and Batch.

Additionally, since it accumulates the average across the embeddings on the fly,
nn.EmbeddingBag can enhance the performance and memory efficiency
of processing a sequence of tensors.

"""


class TextSentiment(nn.Module):
def __init__(self, vocab_size, embed_dim, num_class):
@@ -15,4 +30,11 @@ def init_weights(self):
self.fc.bias.data.zero_()

def forward(self, text, offsets):
r"""
Arguments:
text: 1-D tensor representing a bag of text tensors
offsets: a list of offsets to delimit the 1-D text tensor
into the individual sequences.

"""
return self.fc(self.embedding(text, offsets))
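To make the offsets mechanism in the module docstring concrete, here is a small standalone sketch of nn.EmbeddingBag consuming a packed 1-D text tensor plus offsets (the numbers are arbitrary):

import torch
import torch.nn as nn

embedding = nn.EmbeddingBag(num_embeddings=10, embedding_dim=4)  # mode defaults to "mean"

# Two entries of lengths 4 and 3, packed into a single 1-D tensor.
text = torch.tensor([1, 2, 4, 5, 4, 3, 2])
offsets = torch.tensor([0, 4])  # entry 0 starts at index 0, entry 1 at index 4

out = embedding(text, offsets)
print(out.shape)  # torch.Size([2, 4]): one mean-pooled embedding per entry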
28 changes: 21 additions & 7 deletions examples/text_classification/predict.py
@@ -1,26 +1,40 @@
import torch
import sys
import argparse
from torchtext.data.utils import get_tokenizer
from torchtext.data.utils import ngrams_iterator

from torchtext.datasets.text_classification import text_normalize

def predict(text, model, dictionary):
def predict(text, model, dictionary, ngrams):
r"""
The predict() function here is used to test the model on a sample text.
The input text is numericalized with the vocab and then sent to
the model for inference.

Arguments:
text: a sample text string
model: the trained model
dictionary: a vocab object holding the string-to-index mapping
ngrams: the number of ngrams.
"""
tokenizer = get_tokenizer("basic_english")
with torch.no_grad():
text = torch.tensor([dictionary.get(token, dictionary['<unk>'])
for token in text_normalize(text)])
text = torch.tensor([dictionary[token]
for token in ngrams_iterator(tokenizer(text), ngrams)])
output = model(text, torch.tensor([0]))
return output.argmax(1).item() + 1


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Predict text from stdin given model and dictionary')
parser.add_argument('model')
parser.add_argument('dictionary')
parser.add_argument('model', help='the path for model')
parser.add_argument('dictionary', help='the path for dictionary')
parser.add_argument('--ngrams', type=int, default=2,
help='ngrams (default=2)')
args = parser.parse_args()

model = torch.load(args.model)
dictionary = torch.load(args.dictionary)
for line in sys.stdin:
print(predict(line, model, dictionary))
print(predict(line, model, dictionary, args.ngrams))
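Finally, a rough sketch of how predict() could be exercised directly from Python rather than via stdin; the import and file names are illustrative and depend on how the model and vocab were saved by the training script:

import torch
from predict import predict  # hypothetical import of this example script

model = torch.load("model.pt")       # illustrative path to a saved TextSentiment model
dictionary = torch.load("vocab.pt")  # illustrative path to the saved vocab object

sample = "Wall St. Bears Claw Back Into the Black as stocks rebound."
print(predict(sample, model, dictionary, ngrams=2))  # predicted class id, starting at 1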