
Experimental translation datasets #751


Merged
merged 44 commits on Jun 5, 2020
3ab5c1e
Merge pull request #1 from pytorch/master
akurniawan Feb 13, 2019
db42b7f
Merge remote-tracking branch 'upstream/master'
Apr 28, 2020
20545fb
first commit for both WMT14 and Multi30k
Apr 30, 2020
f57adf0
add IWSLT dataset
May 1, 2020
ba23a50
update argument documentation for all datasets
May 2, 2020
1359d4b
add unit test for all datasets and updating behaviour for conditional…
May 2, 2020
dc39604
add utilities for loading translation data
May 2, 2020
2b34d39
Merge branch 'master' of https://github.com/pytorch/text into new_tra…
May 2, 2020
b064941
fix linting
May 2, 2020
c9fabc5
remove slow unittest
May 2, 2020
c794d9e
update spacy model names and add them to requirements
May 2, 2020
5d0eb66
add dependency for spacy models
May 2, 2020
48cbb63
restore unnecessary changes
May 18, 2020
1cd21f5
move functionalities to specific dataset
May 18, 2020
21f49fd
Merge branch 'master' of https://github.com/pytorch/text into new_tra…
May 18, 2020
2264e16
adding raw module for translation dataset
May 19, 2020
756e746
fix documentation
May 19, 2020
d4deab4
keeping the generator without converting them to list in the first place
May 20, 2020
19e9c0f
remove data_select and add missing import
May 20, 2020
d9e0324
add translation dataset to __init__
May 20, 2020
4ccbfdc
first working example for translation dataset
May 20, 2020
f861120
remove iter method and fix wrong cache variable
May 20, 2020
85ae041
fix order in train, test and valid
May 20, 2020
83ccfc4
revert changes on test_functional
May 20, 2020
03b7e53
fix revert leftover changes for test_functional
May 20, 2020
c437f05
add spacy model requirement in windows
May 20, 2020
2e285ef
remove default tokenizer parameters as it's already covered on _setup…
May 20, 2020
d12b64f
revert unnecessary changes on test_builtin_dataset
May 20, 2020
36f41b0
update multi30k to new datasets
May 23, 2020
c7d559e
add new functionality to extract .gz files
May 26, 2020
fa05946
change multi30k dataset links to github
May 26, 2020
95ccf0c
Merge branch 'master' of https://github.com/pytorch/text into new_tra…
May 27, 2020
47ae37a
add documentation for IWSLT and Multi30k
Jun 2, 2020
69901dc
remove unnecessary code
Jun 2, 2020
98dceb4
Merge branch 'master' of https://github.com/pytorch/text into new_tra…
Jun 2, 2020
bf0820e
remove spacy specific model from requirements
Jun 2, 2020
b203b57
use modules already available in functional and remove languages para…
Jun 3, 2020
0628072
change return of get_iterator to return both src and tgt
Jun 3, 2020
7e768a1
add data_select parameter
Jun 3, 2020
906f792
add more documentation for tokenizer and vocab
Jun 4, 2020
ade3cda
add doc string
Jun 4, 2020
a758d6c
fix excessive underline
Jun 4, 2020
0726cbf
fix indentation
Jun 5, 2020
2e64b4c
Merge branch 'master' of https://github.com/pytorch/text into new_tra…
Jun 5, 2020
2 changes: 2 additions & 0 deletions .circleci/unittest/linux/scripts/environment.yml
@@ -17,3 +17,5 @@ dependencies:
- sphinx
- sphinx-rtd-theme
- tqdm
- https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5
2 changes: 2 additions & 0 deletions .circleci/unittest/windows/scripts/environment.yml
@@ -18,3 +18,5 @@ dependencies:
- tqdm
- sentencepiece
- future
- https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5
28 changes: 28 additions & 0 deletions docs/source/experimental_datasets.rst
@@ -139,3 +139,31 @@ PennTreebank

.. autoclass:: PennTreebank
:members: __init__


Machine Translation
^^^^^^^^^^^^^^^^^^^

Machine translation datasets are subclasses of the ``TranslationDataset`` class.

.. autoclass:: TranslationDataset
:members: __init__

Multi30k
~~~~~~~~

.. autoclass:: Multi30k
:members: __init__

IWSLT
~~~~~

.. autoclass:: IWSLT
:members: __init__

WMT14
~~~~~

.. autoclass:: WMT14
:members: __init__

42 changes: 38 additions & 4 deletions test/data/test_builtin_datasets.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
# Note that all the tests in this module require dataset (either network access or cached)
import os
import glob
import shutil
import torchtext.data as data
from torchtext.datasets import AG_NEWS
@@ -10,10 +11,11 @@


 def conditional_remove(f):
-    if os.path.isfile(f):
-        os.remove(f)
-    elif os.path.isdir(f):
-        shutil.rmtree(f)
+    for path in glob.glob(f):
+        if os.path.isfile(path):
+            os.remove(path)
+        elif os.path.isdir(path):
+            shutil.rmtree(path)
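Since the helper now expands glob patterns, one call can sweep every extracted file and directory that shares a prefix. A minimal standalone sketch of the same logic (using a temporary directory instead of the test fixture's .data folder):

```python
import glob
import os
import shutil
import tempfile


def conditional_remove(pattern):
    # Remove every file or directory matching the glob pattern.
    for path in glob.glob(pattern):
        if os.path.isfile(path):
            os.remove(path)
        elif os.path.isdir(path):
            shutil.rmtree(path)


root = tempfile.mkdtemp()
for name in ('train.de', 'train.en'):
    open(os.path.join(root, name), 'w').close()
os.makedirs(os.path.join(root, 'train.cache'))

# One call cleans up all three artifacts.
conditional_remove(os.path.join(root, 'train*'))
print(os.listdir(root))  # []
```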


class TestDataset(TorchtextTestCase):
@@ -122,3 +124,35 @@ def test_imdb(self):
        old_vocab = train_dataset.get_vocab()
        new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
        new_train_data, new_test_data = IMDB(vocab=new_vocab)

    def test_multi30k(self):
        from torchtext.experimental.datasets.translation import Multi30k
        # smoke test to ensure multi30k works properly
        train_dataset, valid_dataset, test_dataset = Multi30k()
        self.assertEqual(len(train_dataset), 29000)
        self.assertEqual(len(valid_dataset), 1000)
        self.assertEqual(len(test_dataset), 1014)

        de_vocab, en_vocab = train_dataset.get_vocab()
        de_tokens_ids = [
            de_vocab[token] for token in
            'Zwei Männer verpacken Donuts in Kunststofffolie'.split()
        ]
        self.assertEqual(de_tokens_ids, [19, 29, 18703, 4448, 5, 6240])

        en_tokens_ids = [
            en_vocab[token] for token in
            'Two young White males are outside near many bushes'.split()
        ]
        self.assertEqual(en_tokens_ids,
                         [17, 23, 1167, 806, 15, 55, 82, 334, 1337])

        datafile = os.path.join(self.project_root, ".data", "train*")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data", "val*")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data", "test*")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data",
                                "multi30k_task*.tar.gz")
        conditional_remove(datafile)
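The vocab lookups in the test above are just token-to-index mapping with an out-of-vocabulary fallback. A self-contained sketch with a toy vocabulary (the tokens and ids here are illustrative only, not the real Multi30k vocab):

```python
from collections import defaultdict

# Toy German vocabulary; index 0 stands in for '<unk>'.
# Ids are made up for illustration, not the real Multi30k ids.
itos = ['<unk>', 'Zwei', 'Männer', 'verpacken']
stoi = defaultdict(int, {tok: i for i, tok in enumerate(itos)})

sentence = 'Zwei Männer verpacken Donuts'
ids = [stoi[token] for token in sentence.split()]
print(ids)  # [1, 2, 3, 0] since 'Donuts' is out-of-vocabulary
```

torchtext's `Vocab` behaves much like this mapping when an unknown-token default is configured.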
28 changes: 28 additions & 0 deletions test/test_utils.py
@@ -42,6 +42,34 @@ def test_download_extract_tar(self):
            conditional_remove(f)
        conditional_remove(archive_path)

    def test_download_extract_gz(self):
        # create root directory for downloading data
        root = '.data'
        if not os.path.exists(root):
            os.makedirs(root)

        # ensure archive is not already downloaded; if it is, delete it
        url = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task2/raw/val.5.en.gz'
        target_archive_path = os.path.join(root, 'val.5.en.gz')
        conditional_remove(target_archive_path)

        # download archive and ensure it is in the correct location
        archive_path = utils.download_from_url(url)
        assert target_archive_path == archive_path

        # extract files and ensure they are correct
        files = utils.extract_archive(archive_path)
        assert files == [os.path.join(root, 'val.5.en')]

        # extract files with overwrite option True
        files = utils.extract_archive(archive_path, overwrite=True)
        assert files == [os.path.join(root, 'val.5.en')]

        # remove files and archive
        for f in files:
            conditional_remove(f)
        conditional_remove(archive_path)
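The gz test exercises `extract_archive` on a single-file gzip archive; conceptually that extraction is just streaming decompression to a path with the `.gz` suffix stripped. A rough standalone sketch of the idea (a simplified assumption, not torchtext's actual `extract_archive` implementation):

```python
import gzip
import os
import shutil
import tempfile


def extract_gz(gz_path, overwrite=False):
    # Strip the trailing '.gz' to get the output filename,
    # then stream-decompress into it.
    out_path = gz_path[:-3]
    if os.path.exists(out_path) and not overwrite:
        return [out_path]
    with gzip.open(gz_path, 'rb') as fin, open(out_path, 'wb') as fout:
        shutil.copyfileobj(fin, fout)
    return [out_path]


# Round-trip demo with a locally created archive.
root = tempfile.mkdtemp()
gz_path = os.path.join(root, 'val.5.en.gz')
with gzip.open(gz_path, 'wb') as f:
    f.write(b'Two men are packing donuts.\n')

extracted = extract_gz(gz_path)
text = open(extracted[0], 'rb').read()
print(extracted, text)
```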

    def test_download_extract_zip(self):
        # create root directory for downloading data
        root = '.data'
4 changes: 4 additions & 0 deletions torchtext/experimental/datasets/raw/__init__.py
@@ -1,6 +1,7 @@
from .text_classification import AG_NEWS, SogouNews, DBpedia, YelpReviewPolarity, \
    YelpReviewFull, YahooAnswers, \
    AmazonReviewPolarity, AmazonReviewFull, IMDB
from .translation import Multi30k, IWSLT, WMT14
from .language_modeling import WikiText2, WikiText103, PennTreebank, WMTNewsCrawl

__all__ = ['IMDB',
@@ -12,6 +13,9 @@
           'YahooAnswers',
           'AmazonReviewPolarity',
           'AmazonReviewFull',
           'Multi30k',
           'IWSLT',
           'WMT14',
           'WikiText2',
           'WikiText103',
           'PennTreebank',