Example docs #576

Merged
merged 22 commits on Jul 31, 2019
Changes from all commits
5 changes: 5 additions & 0 deletions docs/source/data.rst
@@ -137,6 +137,11 @@ Functions

.. autofunction:: get_tokenizer

:hidden:`ngrams_iterator`
~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: ngrams_iterator

:hidden:`interleave_keys`
~~~~~~~~~~~~~~~~~~~~~~~~~

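For context on the new ngrams_iterator entry, here is a minimal usage sketch. It assumes the torchtext.data.utils version of the helper, which, as I understand it, yields the original tokens followed by the space-joined n-grams:

from torchtext.data.utils import get_tokenizer, ngrams_iterator

tokenizer = get_tokenizer("basic_english")
tokens = tokenizer("here we are")
# Unigrams first, then the joined bigrams.
print(list(ngrams_iterator(tokens, 2)))
# expected: ['here', 'we', 'are', 'here we', 'we are']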
9 changes: 5 additions & 4 deletions docs/source/vocab.rst
@@ -55,12 +55,13 @@ Pretrained Word Embeddings
Misc.
-----

:hidden:`_default_unk_index`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:hidden:`build_vocab_from_iterator`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: _default_unk_index
.. autodata:: build_vocab_from_iterator

:hidden:`pretrained_aliases`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autodata:: pretrained_aliases
.. autodata:: pretrained_aliases
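For the newly documented build_vocab_from_iterator, here is a minimal usage sketch, assuming the 0.4-era signature that takes a single iterator yielding lists of tokens (the sample lines are illustrative):

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")
lines = ["the quick brown fox", "jumps over the lazy dog"]

# Each element of the iterator is the token list for one line.
vocab = build_vocab_from_iterator(tokenizer(line) for line in lines)
print(len(vocab), vocab["the"])  # vocab size and the index assigned to "the"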

18 changes: 14 additions & 4 deletions examples/text_classification/create_datasets.py
@@ -6,14 +6,24 @@

from torchtext.datasets import text_classification

r"""
Once you have the datasets, you can save them as a list of tensors
and load them later in other projects. Here is an example of saving and
loading the text_classification datasets.
"""

if __name__ == "__main__":
parser = argparse.ArgumentParser(description=(
'Create list of Tensors for training and '
'testing based on given datasets'))
parser.add_argument('dataset', choices=text_classification.DATASETS)
parser.add_argument('--logging-level', default='WARNING')
parser.add_argument('--ngrams', type=int, default=2)
parser.add_argument('--root', default='.data')
parser.add_argument('dataset', choices=text_classification.DATASETS,
help='dataset name')
parser.add_argument('--logging-level', default='WARNING',
help='logging level (default=WARNING)')
parser.add_argument('--ngrams', type=int, default=2,
help='ngrams (default=2)')
parser.add_argument('--root', default='.data',
help='data directory (default=.data)')
args = parser.parse_args()

logging.basicConfig(level=getattr(logging, args.logging_level))
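To illustrate the docstring added at the top of create_datasets.py, here is a rough sketch of the save/load round trip it refers to; the dataset name and file paths are illustrative, not part of the script:

import torch
from torchtext.datasets import text_classification

# Build the dataset once (this downloads the raw data into root if needed).
train_dataset, test_dataset = text_classification.DATASETS["AG_NEWS"](
    root=".data", ngrams=2)
torch.save(train_dataset, "ag_news_train.data")
torch.save(test_dataset, "ag_news_test.data")

# Load the saved datasets later, e.g. in another project.
train_dataset = torch.load("ag_news_train.data")
test_dataset = torch.load("ag_news_test.data")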
116 changes: 101 additions & 15 deletions examples/text_classification/iterable_train.py
@@ -15,8 +15,32 @@

from tqdm import tqdm

r"""
This example shows how to build an iterable dataset from an iterator. The
get_csv_iterator() function is used to read the CSV data file. A custom
iterable dataset class sets up the iterators for training the model.
"""


def generate_batch(batch):
"""
Since the text entries have different lengths, a custom function
generate_batch() is used to generate data batches and offsets,
which are compatible with EmbeddingBag. The function is passed
to 'collate_fn' in torch.utils.data.DataLoader. The input to
'collate_fn' is a list of tensors with the size of batch_size,
and the 'collate_fn' function packs them into a mini-batch.
Make sure that 'collate_fn' is declared as a top-level def so
that the function is available in each worker.
Output:
text: the text entries in the batch are packed into a list and
concatenated as a single tensor, the input of nn.EmbeddingBag.
offsets: a tensor of delimiters representing the beginning index
of each individual sequence in the text tensor.
label: a tensor holding the labels of the individual text entries.
"""

label = torch.tensor([entry[0] for entry in batch])
text = [entry[1] for entry in batch]
offsets = [0] + [len(entry) for entry in text]
@@ -25,15 +49,30 @@ def generate_batch(batch):
return text, offsets, label
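As a concrete illustration of the docstring above, here is a tiny batch run through generate_batch(). It assumes the collapsed lines of the function concatenate the text tensors and take a cumulative sum of the entry lengths to produce the offsets:

import torch

# Two (label, numericalized-text) entries of different lengths.
batch = [(0, torch.tensor([4, 7, 2])), (1, torch.tensor([5, 9]))]
text, offsets, label = generate_batch(batch)
print(text)     # tensor([4, 7, 2, 5, 9])  all tokens packed into one 1-D tensor
print(offsets)  # tensor([0, 3])           start index of each entry
print(label)    # tensor([0, 1])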


r"""
torch.utils.data.DataLoader is recommended for PyTorch users to load data.
We use DataLoader here to load the datasets and pass them to the train()
and test() functions.
"""


def train(lr_, num_epoch, data_):
r"""
Here we use the SGD optimizer to train the model.

Arguments:
lr_: learning rate
num_epoch: the number of epochs for training the model
data_: the data used to train the model
"""
data = DataLoader(
data_,
batch_size=batch_size,
collate_fn=generate_batch,
num_workers=args.num_workers,
pin_memory=True)
optimizer = torch.optim.SGD(model.parameters(), lr=lr_)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, args.lr_gamma)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=args.lr_gamma)
with tqdm(unit_scale=0, unit='lines', total=train_num_lines * num_epochs) as t:
avg_loss = 0.0
for i, (text, offsets, label) in enumerate(data):
@@ -55,6 +94,10 @@ def train(lr_, num_epoch, data_):


def test(data_):
r"""
Arguments:
data_: the data used to test the model
"""
data = DataLoader(
data_,
batch_size=batch_size,
@@ -72,6 +115,19 @@ def test(data_):


def get_csv_iterator(data_path, ngrams, vocab, start=0, num_lines=None):
r"""
Generate an iterator to read the CSV file.
The yielded values are an integer for the label and a tensor for the text part.

Arguments:
data_path: a path for the data file.
ngrams: the number used for ngrams.
vocab: a vocab object holding the string-to-index information.
start: the starting line to read (Default: 0). This is useful for
on-the-fly multi-processing data loading.
num_lines: the number of lines read by the iterator (Default: None).

"""
def iterator(start, num_lines):
tokenizer = get_tokenizer("basic_english")
with io.open(data_path, encoding="utf8") as f:
@@ -93,6 +149,15 @@ def iterator(start, num_lines):


class Dataset(torch.utils.data.IterableDataset):
r"""
An iterable dataset that holds the data. This dataset supports multi-processing
to load the data.

Arguments:
iterator: the iterator used to read the data.
num_lines: the number of lines read by the individual iterator.
num_epochs: the number of epochs.
"""
def __init__(self, iterator, num_lines, num_epochs):
super(Dataset, self).__init__()
self._num_lines = num_lines
@@ -101,6 +166,13 @@ def __init__(self, iterator, num_lines, num_epochs):
self._setup = False

def _setup_iterator(self):
r"""
The _setup_iterator() function assigns the starting line and the number
of lines to read to the individual worker, and then passes them to the
iterator that loads the data.

If worker info is not available, the iterator reads all the lines across epochs.
"""
worker_info = torch.utils.data.get_worker_info()
if worker_info:
chunk = int(self._num_lines / worker_info.num_workers)
@@ -128,6 +200,9 @@ def __iter__(self):


def count(data_path):
r"""
Return the total number of text entries and labels.
"""
with io.open(data_path, encoding="utf8") as f:
reader = unicode_csv_reader(f)
labels = [int(row[0]) for row in reader]
@@ -139,20 +214,31 @@
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Train a text classification model on text classification datasets.')
parser.add_argument('train_data_path')
parser.add_argument('test_data_path')
parser.add_argument('vocab')
parser.add_argument('--num-epochs', type=int, default=3)
parser.add_argument('--embed-dim', type=int, default=128)
parser.add_argument('--batch-size', type=int, default=64)
parser.add_argument('--lr', type=float, default=64.0)
parser.add_argument('--lr-gamma', type=float, default=0.999)
parser.add_argument('--ngrams', type=int, default=2)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--device', default='cpu')
parser.add_argument('--data', default='.data')
parser.add_argument('--save-model-path')
parser.add_argument('--logging-level', default='WARNING')
parser.add_argument('train_data_path', help='path for train data')
parser.add_argument('test_data_path', help='path for test data')
parser.add_argument('vocab', help='path for vocab object')
parser.add_argument('--num-epochs', type=int, default=3,
help='num epochs (default=3)')
parser.add_argument('--embed-dim', type=int, default=128,
help='embed dim. (default=128)')
parser.add_argument('--batch-size', type=int, default=64,
help='batch size (default=64)')
parser.add_argument('--lr', type=float, default=4.0,
help='learning rate (default=4.0)')
parser.add_argument('--lr-gamma', type=float, default=0.8,
help='gamma value for lr (default=0.8)')
parser.add_argument('--ngrams', type=int, default=2,
help='ngrams (default=2)')
parser.add_argument('--num-workers', type=int, default=1,
help='num of workers (default=1)')
parser.add_argument('--device', default='cpu',
help='device (default=cpu)')
parser.add_argument('--data', default='.data',
help='data directory (default=.data)')
parser.add_argument('--save-model-path',
help='path for saving model')
parser.add_argument('--logging-level', default='WARNING',
help='logging level (default=WARNING)')
args = parser.parse_args()

num_epochs = args.num_epochs
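The worker-chunking behaviour described in the Dataset and _setup_iterator docstrings above can be hard to see in the diff, so here is a stripped-down, hypothetical sketch of the same pattern (not the code in this PR, just the idea it describes):

from torch.utils.data import DataLoader, IterableDataset, get_worker_info


class ChunkedIterableDataset(IterableDataset):
    # Hypothetical minimal version of the worker-aware chunking pattern.
    def __init__(self, num_lines):
        self.num_lines = num_lines

    def __iter__(self):
        worker_info = get_worker_info()
        if worker_info is None:
            # Single-process loading: read everything.
            start, end = 0, self.num_lines
        else:
            # Split the lines across workers; each worker reads only its chunk.
            chunk = self.num_lines // worker_info.num_workers
            start = chunk * worker_info.id
            last = worker_info.id == worker_info.num_workers - 1
            end = self.num_lines if last else start + chunk
        for i in range(start, end):
            yield i  # stand-in for a (label, text_tensor) pair read from the CSV


if __name__ == "__main__":
    loader = DataLoader(ChunkedIterableDataset(10), num_workers=2)
    print(sorted(int(x) for x in loader))  # every line appears exactly once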
22 changes: 22 additions & 0 deletions examples/text_classification/model.py
@@ -1,5 +1,20 @@
import torch.nn as nn

r"""
The model is composed of an EmbeddingBag layer and a linear layer.

nn.EmbeddingBag computes the mean of 'bags' of embeddings. The text
entries here have different lengths. nn.EmbeddingBag requires no
padding because the lengths of the sentences are saved in offsets.
Therefore, this method is much faster than the original one
with TorchText Iterator and Batch.

Additionally, since it accumulates the average across the embeddings on the fly,
nn.EmbeddingBag can enhance the performance and memory efficiency
of processing a sequence of tensors.

"""


class TextSentiment(nn.Module):
def __init__(self, vocab_size, embed_dim, num_class):
@@ -15,4 +30,11 @@ def init_weights(self):
self.fc.bias.data.zero_()

def forward(self, text, offsets):
r"""
Arguments:
text: 1-D tensor representing a bag of text tensors
offsets: a list of offsets to delimit the 1-D text tensor
into the individual sequences.

"""
return self.fc(self.embedding(text, offsets))
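To make the offsets mechanism in the module docstring concrete, here is a small standalone sketch of nn.EmbeddingBag consuming a packed 1-D text tensor plus offsets (the numbers are arbitrary):

import torch
import torch.nn as nn

embedding = nn.EmbeddingBag(num_embeddings=10, embedding_dim=4)  # mode defaults to "mean"

# Two entries of lengths 4 and 3, packed into a single 1-D tensor.
text = torch.tensor([1, 2, 4, 5, 4, 3, 2])
offsets = torch.tensor([0, 4])  # entry 0 starts at index 0, entry 1 at index 4

out = embedding(text, offsets)
print(out.shape)  # torch.Size([2, 4]): one mean-pooled embedding per entry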
28 changes: 21 additions & 7 deletions examples/text_classification/predict.py
@@ -1,26 +1,40 @@
import torch
import sys
import argparse
from torchtext.data.utils import get_tokenizer
from torchtext.data.utils import ngrams_iterator

from torchtext.datasets.text_classification import text_normalize

def predict(text, model, dictionary):
def predict(text, model, dictionary, ngrams):
r"""
The predict() function here is used to test the model on a sample text.
The input text is numericalized with the vocab and then sent to
the model for inference.

Arguments:
text: a sample text string
model: the trained model
dictionary: a vocab object holding the string-to-index mapping
ngrams: the number of ngrams.
"""
tokenizer = get_tokenizer("basic_english")
with torch.no_grad():
text = torch.tensor([dictionary.get(token, dictionary['<unk>'])
for token in text_normalize(text)])
text = torch.tensor([dictionary[token]
for token in ngrams_iterator(tokenizer(text), ngrams)])
output = model(text, torch.tensor([0]))
return output.argmax(1).item() + 1


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Predict text from stdin given model and dictionary')
parser.add_argument('model')
parser.add_argument('dictionary')
parser.add_argument('model', help='the path for model')
parser.add_argument('dictionary', help='the path for dictionary')
parser.add_argument('--ngrams', type=int, default=2,
help='ngrams (default=2)')
args = parser.parse_args()

model = torch.load(args.model)
dictionary = torch.load(args.dictionary)
for line in sys.stdin:
print(predict(line, model, dictionary))
print(predict(line, model, dictionary, args.ngrams))
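Finally, a rough sketch of how predict() could be exercised directly from Python rather than via stdin; the import and file names are illustrative and depend on how the model and vocab were saved by the training script:

import torch
from predict import predict  # hypothetical import of this example script

model = torch.load("model.pt")       # illustrative path to a saved TextSentiment model
dictionary = torch.load("vocab.pt")  # illustrative path to the saved vocab object

sample = "Wall St. Bears Claw Back Into the Black as stocks rebound."
print(predict(sample, model, dictionary, ngrams=2))  # predicted class id, starting at 1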