Refactor text_classification, improve extract_archive and add a small example #565

Merged: 48 commits, Jul 26, 2019

Commits
c30ec38 - add new APIs to build dataset. (Jul 9, 2019)
ffd1f49 - Add new datasets for text classification. (Jul 11, 2019)
3b7b0e2 - Add docs and examples. (Jul 9, 2019)
5a31dd3 - Split text_normalize out of preprocess function. (Jul 11, 2019)
5efa58e - Add docs and test case. (Jul 11, 2019)
844242a - Update README file. (Jul 12, 2019)
b373de9 - revise generate_iters() function. (Jul 22, 2019)
6d5cb03 - Remove TextDataset class. (Jul 22, 2019)
3f0c523 - Remove generate_iterators() API (Jul 22, 2019)
2f20914 - remove unnecessary library loading (Jul 22, 2019)
57d0d03 - Re-name build_vocab to build_dictionary (Jul 22, 2019)
4cf4099 - change build_vocab to build_dictionary. (Jul 22, 2019)
c8ec403 - convert two functions to the interanls. (Jul 22, 2019)
0568a04 - Change the API of _load_text_classification_data() function. (Jul 22, 2019)
78673a5 - use a static list for url. (Jul 22, 2019)
58e3bac - use logging.info as print. (Jul 22, 2019)
81e5a31 - combine download and extract_archive (Jul 22, 2019)
e05d7fe - Merge branch 'master' into new_pattern (cpuhrsch, Jul 23, 2019)
7ffb267 - Merge branch 'new_supervised_learning_dataset' into new_pattern (Jul 23, 2019)
e138fa8 - examples (Jul 23, 2019)
1e9f0e1 - remove more (Jul 23, 2019)
c746d86 - less (Jul 23, 2019)
fea3bad - split (Jul 24, 2019)
5c90fbc - ordered dict (Jul 24, 2019)
ba23ae1 - Merge remote-tracking branch 'upstream/master' into tutorial (Jul 24, 2019)
3df4dc1 - rename (Jul 24, 2019)
ea639c2 - Simplifications (Jul 24, 2019)
193a670 - clean more (Jul 24, 2019)
285a515 - more efficient dictionary building (Jul 24, 2019)
fc1fcc1 - Merge branch 'master' into tutorial (Jul 24, 2019)
3e27dcd - Reduce code (Jul 24, 2019)
4678478 - tar and extraction (Jul 24, 2019)
2a18586 - Merge branch 'additionalstuff' into tutorial (Jul 24, 2019)
ee9894f - rebase (Jul 24, 2019)
197c70d - remove legacy (Jul 24, 2019)
bc2369f - more logging and args (Jul 25, 2019)
0e81889 - more (Jul 25, 2019)
75fd515 - small changes (Jul 25, 2019)
e7ea6c2 - More small changes (Jul 25, 2019)
accf587 - Update docs (Jul 25, 2019)
5506a2e - bring back examples (Jul 25, 2019)
2c8c4bf - bring back examples (Jul 25, 2019)
28b0976 - small fix (Jul 25, 2019)
5520b51 - Small test fix (Jul 25, 2019)
ff9132d - Use io.open (Jul 25, 2019)
331bf79 - flake8 (Jul 25, 2019)
73772e2 - flake8 (Jul 25, 2019)
fdbeb26 - remove print (Jul 26, 2019)
4 changes: 4 additions & 0 deletions .gitignore
@@ -116,3 +116,7 @@ venv.bak/
.mypy_cache/

# End of https://www.gitignore.io/api/python

# vim
*.swp
*.swo
19 changes: 19 additions & 0 deletions examples/text_classification/model.py
@@ -0,0 +1,19 @@
import torch.nn as nn


class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)
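
A minimal sketch of how this model is fed, assuming the TextSentiment class above (the sizes are hypothetical): a batch is one flattened tensor of token ids plus an offsets tensor marking where each example starts, which nn.EmbeddingBag mean-pools per example before the linear layer.

import torch

from model import TextSentiment

# Hypothetical sizes, for illustration only.
model = TextSentiment(vocab_size=10, embed_dim=4, num_class=2)
text = torch.tensor([1, 3, 5, 2, 4])  # two examples, [1, 3, 5] and [2, 4], flattened
offsets = torch.tensor([0, 3])        # start index of each example within `text`
logits = model(text, offsets)         # shape (2, 2): one row of class scores per example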
26 changes: 26 additions & 0 deletions examples/text_classification/predict.py
@@ -0,0 +1,26 @@
import torch
import sys
import argparse

from torchtext.datasets.text_classification import text_normalize


def predict(text, model, dictionary):
    with torch.no_grad():
        text = torch.tensor([dictionary.get(token, dictionary['<unk>'])
                             for token in text_normalize(text)])
        # A batch of one example: the single entry starts at offset 0.
        output = model(text, torch.tensor([0]))
        # argmax is 0-based; the dataset labels are 1-based.
        return output.argmax(1).item() + 1


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Predict text from stdin given model and dictionary')
    parser.add_argument('model')
    parser.add_argument('dictionary')
    args = parser.parse_args()

    model = torch.load(args.model)
    dictionary = torch.load(args.dictionary)
    for line in sys.stdin:
        print(predict(line, model, dictionary))
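
To try it out, pipe one document per line on stdin, e.g. echo "Wall St. rallies" | python predict.py model.pt dictionary.pt (file names hypothetical; the files are produced by train.py below). It prints one 1-based label per input line.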
105 changes: 105 additions & 0 deletions examples/text_classification/train.py
@@ -0,0 +1,105 @@
import os
import logging
import random
import argparse

import torch

from torchtext.datasets.text_classification import AG_NEWS

from model import TextSentiment


def generate_offsets(data_batch):
    # offsets[i] is the index at which example i starts in the flattened
    # text tensor, as expected by nn.EmbeddingBag.
    offsets = [0]
    for entry in data_batch:
        offsets.append(offsets[-1] + len(entry))
    offsets = torch.tensor(offsets[:-1])
    return offsets


def generate_batch(data, labels, i, batch_size):
    data_batch = data[i:i + batch_size]
    text = torch.cat(data_batch)
    offsets = generate_offsets(data_batch)
    cls = torch.tensor(labels[i:i + batch_size])
    text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
    return text, offsets, cls


def train(lr_, num_epochs, data, labels):
    num_lines = num_epochs * len(data)
    for epoch in range(num_epochs):
        # Shuffle data and labels with the same permutation.
        perm = list(range(len(data)))
        random.shuffle(perm)
        data = [data[i] for i in perm]
        labels = [labels[i] for i in perm]

        for i in range(0, len(data), batch_size):
            text, offsets, cls = generate_batch(data, labels, i, batch_size)
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss.backward()
            progress = (i + len(data) * epoch) / float(num_lines)
            # Manual SGD step with a learning rate that decays linearly
            # from lr_ to zero over the course of training.
            lr = lr_ * (1 - progress)
            for p in model.parameters():
                p.data.add_(p.grad.data * -lr)
                p.grad.detach_()
                p.grad.zero_()
        print("")


def test(data, labels):
    total_accuracy = []
    for i in range(0, len(data), batch_size):
        with torch.no_grad():
            text, offsets, cls = generate_batch(data, labels, i, batch_size)
            output = model(text, offsets)
            accuracy = (output.argmax(1) == cls).float().mean().item()
            total_accuracy.append(accuracy)
    print("Test - Accuracy: {}".format(sum(total_accuracy) / len(total_accuracy)))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Train a text classification model on AG_NEWS')
    parser.add_argument('--num-epochs', type=int, default=3)
    parser.add_argument('--embed-dim', type=int, default=128)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--lr', type=float, default=64.0)
    parser.add_argument('--ngrams', type=int, default=2)
    parser.add_argument('--device', default='cpu')
    parser.add_argument('--data', default='.data')
    parser.add_argument('--save-model-path')
    parser.add_argument('--save-dictionary-path')
    parser.add_argument('--logging-level', default='WARNING')
    args = parser.parse_args()

    num_epochs = args.num_epochs
    embed_dim = args.embed_dim
    batch_size = args.batch_size
    lr = args.lr
    device = args.device
    data = args.data

    logging.basicConfig(level=getattr(logging, args.logging_level))

    if not os.path.exists(data):
        print("Creating directory {}".format(data))
        os.mkdir(data)

    dataset = AG_NEWS(root=data, ngrams=args.ngrams)
    model = TextSentiment(len(dataset.dictionary), embed_dim,
                          len(set(dataset.labels))).to(device)
    criterion = torch.nn.CrossEntropyLoss().to(device)

    train(lr, num_epochs, dataset.train_data, dataset.train_labels)
    test(dataset.test_data, dataset.test_labels)

    if args.save_model_path:
        print("Saving model to {}".format(args.save_model_path))
        torch.save(model.to('cpu'), args.save_model_path)
    if args.save_dictionary_path:
        print("Saving dictionary to {}".format(args.save_dictionary_path))
        torch.save(dataset.dictionary, args.save_dictionary_path)
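
An example invocation, using the flags defined above (output paths hypothetical):

python train.py --device cuda --ngrams 2 --save-model-path model.pt --save-dictionary-path dictionary.pt --logging-level INFO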
2 changes: 2 additions & 0 deletions test/data/test_builtin_datasets.py
@@ -53,6 +53,8 @@ def test_text_classification(self):
        # smoke test to ensure ag_news dataset works properly

        datadir = os.path.join(self.project_root, ".data")
        if not os.path.exists(datadir):
            os.mkdir(datadir)
        ag_news_cls = AG_NEWS(root=datadir, ngrams=3)
        self.assertEqual(len(ag_news_cls.train_examples), 120000)
        self.assertEqual(len(ag_news_cls.test_examples), 7600)
6 changes: 3 additions & 3 deletions test/test_vocab.py
@@ -145,7 +145,7 @@ def test_vocab_extend(self):
self.assertGreater(len(v), n_vocab)

self.assertEqual(v.itos[:6], ['<unk>', '<pad>', '<bos>',
'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'])
'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'])
vectors = v.vectors.numpy()

# The first 5 entries in each vector.
@@ -267,7 +267,7 @@ def test_vocab_download_glove_vectors(self):
conditional_remove(zip_file)
for dim in ["25", "50", "100", "200"]:
conditional_remove(os.path.join(self.project_root, ".vector_cache",
"glove.twitter.27B.{}d.txt".format(dim)))
"glove.twitter.27B.{}d.txt".format(dim)))

@slow
def test_vocab_download_charngram_vectors(self):
@@ -355,4 +355,4 @@ def test_vectors_get_vecs(self):
conditional_remove(zip_file)
for dim in ["50", "100", "200", "300"]:
conditional_remove(os.path.join(self.project_root, ".vector_cache",
"glove.6B.{}d.txt".format(dim)))
"glove.6B.{}d.txt".format(dim)))
2 changes: 1 addition & 1 deletion torchtext/data/iterator.py
@@ -62,7 +62,7 @@ def __init__(self, dataset, batch_size, sort_key=None, device=None,
else:
self.sort_key = sort_key

if type(device) == int:
if isinstance(device, int):
logger.warning("The `device` argument should be set by using `torch.device`"
+ " or passing a string as an argument. This behavior will be"
+ " deprecated soon and currently defaults to cpu.")
1 change: 1 addition & 0 deletions torchtext/data/pipeline.py
@@ -9,6 +9,7 @@ class Pipeline(object):
        pipes: The Pipelines that will be applied to input sequence
            data in order.
    """

    def __init__(self, convert_token=None):
        """Create a pipeline.
1 change: 1 addition & 0 deletions torchtext/datasets/nli.py
@@ -17,6 +17,7 @@ class ParsedTextField(data.Field):
    Expensive tokenization could be omitted from the pipeline as
    the parse tree annotations are already in tokenized form.
    """

    def __init__(self, eos_token='<pad>', lower=False, reverse=False):
        if reverse:
            super(ParsedTextField, self).__init__(