
added bAbI dataset #286


Merged
merged 3 commits on Apr 27, 2018
50 changes: 50 additions & 0 deletions test/babi.py
@@ -0,0 +1,50 @@
from torchtext import datasets

# en-valid
TRAIN_NUM = [0] + [900] * 16 + [904, 905, 900, 904]
VAL_NUM = [0] + [100] * 16 + [96, 95, 100, 96]
TEST_NUM = [0] + [1000] * 20

# Testcase 1 (joint training)
train_iter, val_iter, test_iter = datasets.BABI20.iters(task=1, joint=True)
assert len(train_iter.dataset) == sum(TRAIN_NUM)
assert len(val_iter.dataset) == VAL_NUM[1]
assert len(test_iter.dataset) == TEST_NUM[1]

# Testcase 2 (only supporting)
train_iter, val_iter, test_iter = datasets.BABI20.iters(task=1, only_supporting=True)
assert len(train_iter.dataset) == TRAIN_NUM[2]
assert len(val_iter.dataset) == VAL_NUM[2]
assert len(test_iter.dataset) == TEST_NUM[2]

# Testcase 3 (single task)
for i in range(1, 21):
    train_iter, val_iter, test_iter = datasets.BABI20.iters(task=i)
    assert len(train_iter.dataset) == TRAIN_NUM[i]
    assert len(val_iter.dataset) == VAL_NUM[i]
    assert len(test_iter.dataset) == TEST_NUM[i]

# en-valid-10k
TRAIN_NUM = [0] + [9000] * 17 + [8996, 9000, 9002]
VAL_NUM = [0] + [1000] * 17 + [1004, 1000, 998]
TEST_NUM = [0] + [1000] * 20

# Testcase 1 (joint training)
train_iter, val_iter, test_iter = datasets.BABI20.iters(task=1, joint=True, tenK=True)
assert len(train_iter.dataset) == sum(TRAIN_NUM)
assert len(val_iter.dataset) == VAL_NUM[1]
assert len(test_iter.dataset) == TEST_NUM[1]

# Testcase 2 (only supporting)
train_iter, val_iter, test_iter = datasets.BABI20.iters(task=1, only_supporting=True,
                                                        tenK=True)
assert len(train_iter.dataset) == TRAIN_NUM[2]
assert len(val_iter.dataset) == VAL_NUM[2]
assert len(test_iter.dataset) == TEST_NUM[2]

# Testcase 3 (single task)
for i in range(1, 21):
    train_iter, val_iter, test_iter = datasets.BABI20.iters(task=i, tenK=True)
    assert len(train_iter.dataset) == TRAIN_NUM[i]
    assert len(val_iter.dataset) == VAL_NUM[i]
    assert len(test_iter.dataset) == TEST_NUM[i]
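Note on the assertions above: the iterators returned by BABI20.iters yield batches whose attributes mirror the fields declared in BABI20.__init__ below ('story', 'query', 'answer'). A minimal consumption sketch, assuming the torchtext 0.2-era batch API used throughout this PR (the break matters because a training Iterator repeats across epochs by default):

from torchtext import datasets

train_iter, val_iter, test_iter = datasets.BABI20.iters(task=1, batch_size=32)
for batch in train_iter:
    # batch_first=True is forced by BABI20Field, so the batch dimension leads
    story = batch.story    # (batch, memory_size, sentence_len) word indices
    query = batch.query    # (batch, query_len) word indices
    answer = batch.answer  # single-token answers
    break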
6 changes: 5 additions & 1 deletion torchtext/data/field.py
@@ -1,5 +1,6 @@
 # coding: utf8
 from collections import Counter, OrderedDict
+from itertools import chain
 import six
 import torch
 from torch.autograd import Variable
@@ -249,7 +250,10 @@ def build_vocab(self, *args, **kwargs):
             for x in data:
                 if not self.sequential:
                     x = [x]
-                counter.update(x)
+                try:
+                    counter.update(x)
+                except TypeError:
+                    counter.update(chain.from_iterable(x))
         specials = list(OrderedDict.fromkeys(
             tok for tok in [self.unk_token, self.pad_token, self.init_token,
                             self.eos_token]
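The try/except added above is what lets build_vocab handle bAbI stories: a story preprocesses to a list of sentences (a list of token lists), and Counter.update raises TypeError on unhashable list elements, at which point chain.from_iterable flattens one nesting level so the individual tokens are counted. A standalone sketch of that behavior:

from collections import Counter
from itertools import chain

counter = Counter()
query = ['where', 'is', 'mary']                  # flat token list: counted directly
story = [['mary', 'moved'], ['john', 'went']]    # list of sentences
counter.update(query)
try:
    counter.update(story)                        # TypeError: 'list' is unhashable
except TypeError:
    counter.update(chain.from_iterable(story))   # flatten once, count the tokens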
4 changes: 3 additions & 1 deletion torchtext/datasets/__init__.py
@@ -5,6 +5,7 @@
 from .sequence_tagging import SequenceTaggingDataset, UDPOS  # NOQA
 from .trec import TREC
 from .imdb import IMDB
+from .babi import BABI20


 __all__ = ['LanguageModelingDataset',
@@ -19,4 +20,5 @@
            'TREC',
            'IMDB',
            'SequenceTaggingDataset',
-           'UDPOS']
+           'UDPOS',
+           'BABI20']
143 changes: 143 additions & 0 deletions torchtext/datasets/babi.py
@@ -0,0 +1,143 @@
import os
from io import open

import torch
from torch.autograd import Variable

from ..data import Dataset, Field, Example, Iterator


class BABI20Field(Field):

    def __init__(self, memory_size, **kwargs):
        super(BABI20Field, self).__init__(**kwargs)
        self.memory_size = memory_size
        self.unk_token = None
        self.batch_first = True

    def preprocess(self, x):
        if isinstance(x, list):
            return [super(BABI20Field, self).preprocess(s) for s in x]
        else:
            return super(BABI20Field, self).preprocess(x)

    def pad(self, minibatch):
        if isinstance(minibatch[0][0], list):
            self.fix_length = max(max(len(x) for x in ex) for ex in minibatch)
            padded = []
            for ex in minibatch:
                # sentences are indexed in reverse order and truncated to memory_size
                nex = ex[::-1][:self.memory_size]
                padded.append(
                    super(BABI20Field, self).pad(nex) +
                    [[self.pad_token] * self.fix_length] * (self.memory_size - len(nex)))
            self.fix_length = None
            return padded
        else:
            return super(BABI20Field, self).pad(minibatch)

    def numericalize(self, arr, device=None, train=True):
        if isinstance(arr[0][0], list):
            tmp = [
                super(BABI20Field, self).numericalize(x, device=device, train=train).data
                for x in arr
            ]
            arr = torch.stack(tmp)
            if device == -1:
                if self.sequential:
                    arr = arr.contiguous()
            else:
                arr = arr.cuda(device)
            return Variable(arr, volatile=not train)
        else:
            return super(BABI20Field, self).numericalize(arr, device=device, train=train)


class BABI20(Dataset):
    urls = ['http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz']
    name = ''
    dirname = ''

    def __init__(self, path, text_field, only_supporting=False, **kwargs):
        fields = [('story', text_field), ('query', text_field), ('answer', text_field)]
        self.sort_key = lambda x: len(x.query)

        with open(path, 'r', encoding="utf-8") as f:
            triplets = self._parse(f, only_supporting)
            examples = [Example.fromlist(triplet, fields) for triplet in triplets]

        super(BABI20, self).__init__(examples, fields, **kwargs)

    @staticmethod
    def _parse(file, only_supporting):
        data, story = [], []
        for line in file:
            tid, text = line.rstrip('\n').split(' ', 1)
            if tid == '1':
                story = []
            # sentence
            if text.endswith('.'):
                story.append(text[:-1])
            # question
            else:
                # remove any leading or trailing whitespace after splitting
                query, answer, supporting = (x.strip() for x in text.split('\t'))
                if only_supporting:
                    substory = [story[int(i) - 1] for i in supporting.split()]
                else:
                    substory = [x for x in story if x]
                data.append((substory, query[:-1], answer))  # remove '?'
                story.append("")
        return data

    @classmethod
    def splits(cls, text_field, path=None, root='.data', task=1, joint=False, tenK=False,
               only_supporting=False, train=None, validation=None, test=None, **kwargs):
        assert isinstance(task, int) and 1 <= task <= 20
        if tenK:
            cls.dirname = os.path.join('tasks_1-20_v1-2', 'en-valid-10k')
        else:
            cls.dirname = os.path.join('tasks_1-20_v1-2', 'en-valid')
        if path is None:
            path = cls.download(root)
        if train is None:
            if joint:  # put all tasks together for joint learning
                train = 'all_train.txt'
                if not os.path.isfile(os.path.join(path, train)):
                    with open(os.path.join(path, train), 'w') as tf:
                        for task in range(1, 21):
                            with open(
                                    os.path.join(path,
                                                 'qa' + str(task) + '_train.txt')) as f:
                                tf.write(f.read())
            else:
                train = 'qa' + str(task) + '_train.txt'
        if validation is None:
            if joint:  # put all tasks together for joint learning
                validation = 'all_valid.txt'
                if not os.path.isfile(os.path.join(path, validation)):
                    with open(os.path.join(path, validation), 'w') as tf:
                        for task in range(1, 21):
                            with open(
                                    os.path.join(path,
                                                 'qa' + str(task) + '_valid.txt')) as f:
                                tf.write(f.read())
            else:
                validation = 'qa' + str(task) + '_valid.txt'
        if test is None:
            test = 'qa' + str(task) + '_test.txt'
        # only_supporting must be forwarded explicitly, or __init__ never sees it
        return super(BABI20, cls).splits(path=path, root=root, text_field=text_field,
                                         train=train, validation=validation, test=test,
                                         only_supporting=only_supporting, **kwargs)

    @classmethod
    def iters(cls, batch_size=32, root='.data', memory_size=50, task=1, joint=False,
              tenK=False, only_supporting=False, sort=False, shuffle=False, device=None,
              **kwargs):
        text = BABI20Field(memory_size)
        train, val, test = BABI20.splits(text, root=root, task=task, joint=joint,
                                         tenK=tenK, only_supporting=only_supporting,
                                         **kwargs)
        text.build_vocab(train)
        return Iterator.splits((train, val, test), batch_size=batch_size, sort=sort,
                               shuffle=shuffle, device=device)
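For reference, _parse above assumes the standard bAbI v1.2 file layout: each line starts with a sentence ID that resets to 1 at the start of a new story, statements end with '.', and question lines hold a tab-separated question, answer, and supporting-fact IDs. An illustrative fragment in the task-1 style (the sentences are made up; the separators after the question are tabs):

1 Mary moved to the bathroom.
2 John went to the hallway.
3 Where is Mary?	bathroom	1
4 Daniel travelled to the office.
5 Where is Daniel?	office	4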
9 changes: 4 additions & 5 deletions torchtext/utils.py
@@ -1,5 +1,4 @@
 import six
-from six.moves import urllib
 import requests
 import csv

@@ -27,10 +26,10 @@ def inner(b=1, bsize=1, tsize=None):
 def download_from_url(url, path):
     """Download file, with logic (from tensor2tensor) for Google Drive"""
     if 'drive.google.com' not in url:
-        opener = urllib.request.build_opener()
-        opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
-        urllib.request.install_opener(opener)
-        return urllib.request.urlretrieve(url, path)
+        r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
+        with open(path, "wb") as file:
+            file.write(r.content)
+        return
     print('downloading from Google Drive; may take a few minutes')
     confirm_token = None
     session = requests.Session()
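Usage note: the rewritten non-Drive branch is the code path the bAbI download takes via cls.download(root). A minimal sketch with the URL declared in BABI20.urls (the destination path is arbitrary here and assumes the directory already exists):

from torchtext.utils import download_from_url

url = 'http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz'
download_from_url(url, '.data/tasks_1-20_v1-2.tar.gz')  # fetches with requests, writes raw bytes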