diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml index b60f219e16..282877c207 100644 --- a/.circleci/unittest/linux/scripts/environment.yml +++ b/.circleci/unittest/linux/scripts/environment.yml @@ -17,5 +17,5 @@ dependencies: - sphinx - sphinx-rtd-theme - tqdm - - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5 - - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5 + - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0 + - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0 diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh index d4f5457906..eb075b1eb1 100755 --- a/.circleci/unittest/linux/scripts/setup_env.sh +++ b/.circleci/unittest/linux/scripts/setup_env.sh @@ -45,4 +45,6 @@ fi # 4. Download printf "* Downloading SpaCy English models\n" -python -m spacy download en +python -m spacy download en_core_web_sm +printf "* Downloading SpaCy German models\n" +python -m spacy download de_core_news_sm diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml index 28790d2585..2d7c790b91 100644 --- a/.circleci/unittest/windows/scripts/environment.yml +++ b/.circleci/unittest/windows/scripts/environment.yml @@ -19,5 +19,5 @@ dependencies: - tqdm - certifi - future - - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz#egg=de_core_news_sm==2.2.5 - - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5 + - https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz#egg=de_core_news_sm==3.0.0 + - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm==3.0.0 diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.circleci/unittest/windows/scripts/setup_env.sh index cd44b436dc..ea99130c5e 100644 --- a/.circleci/unittest/windows/scripts/setup_env.sh +++ b/.circleci/unittest/windows/scripts/setup_env.sh @@ -39,4 +39,6 @@ conda env update --file "${this_dir}/environment.yml" --prune # 4. Download printf "* Downloading SpaCy English models\n" -python -m spacy download en +python -m spacy download en_core_web_sm +printf "* Downloading SpaCy German models\n" +python -m spacy download de_core_news_sm diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index ca53feff6a..8ed1a0c04e 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -206,14 +206,21 @@ def test_multi30k(self): from torchtext.experimental.datasets import Multi30k # smoke test to ensure multi30k works properly train_dataset, valid_dataset, test_dataset = Multi30k() + + # This change is due to the BC breaking in spacy 3.0 self._helper_test_func(len(train_dataset), 29000, train_dataset[20], - ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2], + # ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2], + ([4, 444, 2529, 47, 17490, 7422, 8, 158, 10, 12, 5846, 3, 2], [5, 61, 530, 137, 1494, 10, 9, 280, 6, 2, 3749, 4, 3])) + self._helper_test_func(len(valid_dataset), 1014, valid_dataset[30], ([4, 179, 26, 85, 1005, 57, 19, 154, 3, 2], [5, 24, 32, 81, 47, 1348, 6, 2, 119, 4, 3])) + + # This change is due to the BC breaking in spacy 3.0 self._helper_test_func(len(test_dataset), 1000, test_dataset[40], - ([4, 26, 6, 12, 3915, 1538, 21, 64, 3, 2], + # ([4, 26, 6, 12, 3915, 1538, 21, 64, 3, 2], + ([4, 26, 6, 12, 3913, 1537, 21, 64, 3, 2], [5, 32, 20, 2, 747, 345, 1915, 6, 46, 4, 3])) de_vocab, en_vocab = train_dataset.get_vocab() @@ -221,7 +228,9 @@ def test_multi30k(self): de_vocab[token] for token in 'Zwei Männer verpacken Donuts in Kunststofffolie'.split() ] - self.assertEqual(de_tokens_ids, [20, 30, 18705, 4448, 6, 6241]) + # This change is due to the BC breaking in spacy 3.0 + # self.assertEqual(de_tokens_ids, [20, 30, 18705, 4448, 6, 6241]) + self.assertEqual(de_tokens_ids, [20, 30, 18714, 4447, 6, 6239]) en_tokens_ids = [ en_vocab[token] for token in @@ -240,8 +249,11 @@ def test_multi30k(self): 'A group of men are loading cotton onto a truck\n'])) del train_iter, valid_iter train_dataset, = Multi30k(data_select=('train')) + + # This change is due to the BC breaking in spacy 3.0 self._helper_test_func(len(train_dataset), 29000, train_dataset[20], - ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2], + # ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2], + ([4, 444, 2529, 47, 17490, 7422, 8, 158, 10, 12, 5846, 3, 2], [5, 61, 530, 137, 1494, 10, 9, 280, 6, 2, 3749, 4, 3])) datafile = os.path.join(self.project_root, ".data", "train*") diff --git a/test/test_build.py b/test/test_build.py index d61e844280..2e29392562 100644 --- a/test/test_build.py +++ b/test/test_build.py @@ -107,7 +107,7 @@ class TestDataUtils(TorchtextTestCase): def test_get_tokenizer_spacy(self): # Test SpaCy option, and verify it properly handles punctuation. - assert torchtext.data.get_tokenizer("spacy")(str(self.TEST_STR)) == [ + assert torchtext.data.get_tokenizer("spacy", language='en_core_web_sm')(str(self.TEST_STR)) == [ "A", "string", ",", "particularly", "one", "with", "slightly", "complex", "punctuation", "."] diff --git a/test/translation.py b/test/translation.py index eb3c47a349..89a6f9017f 100644 --- a/test/translation.py +++ b/test/translation.py @@ -4,8 +4,8 @@ import re import spacy -spacy_de = spacy.load('de') -spacy_en = spacy.load('en') +spacy_de = spacy.load('de_core_news_sm') +spacy_en = spacy.load('en_core_web_sm') url = re.compile('(.*)') diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py index 12f653290b..240a3799ff 100644 --- a/torchtext/data/utils.py +++ b/torchtext/data/utils.py @@ -2,7 +2,6 @@ from contextlib import contextmanager from copy import deepcopy import re - from functools import partial