Turn text classification datasets into functions and return training and text examples separately. #566
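This PR turns the text classification dataset constructors into plain functions that return the train and test splits as a tuple, instead of a single dataset object with train_examples/test_examples attributes. A minimal usage sketch based on the updated test further below; the import path and root directory are assumptions and not part of this diff:

    # Sketch of the new function-style API (import path and root assumed;
    # only the tuple return and the split sizes come from the test below).
    from torchtext.datasets.text_classification import AG_NEWS

    train_dataset, test_dataset = AG_NEWS(root='.data', ngrams=3)

    print(len(train_dataset))  # 120000 AG_NEWS training examples
    print(len(test_dataset))   # 7600 AG_NEWS test examples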

Merged: 59 commits, Jul 26, 2019
Commits
c30ec38  add new APIs to build dataset. (Jul 9, 2019)
ffd1f49  Add new datasets for text classification. (Jul 11, 2019)
3b7b0e2  Add docs and examples. (Jul 9, 2019)
5a31dd3  Split text_normalize out of preprocess function. (Jul 11, 2019)
5efa58e  Add docs and test case. (Jul 11, 2019)
844242a  Update README file. (Jul 12, 2019)
b373de9  revise generate_iters() function. (Jul 22, 2019)
6d5cb03  Remove TextDataset class. (Jul 22, 2019)
3f0c523  Remove generate_iterators() API (Jul 22, 2019)
2f20914  remove unnecessary library loading (Jul 22, 2019)
57d0d03  Re-name build_vocab to build_dictionary (Jul 22, 2019)
4cf4099  change build_vocab to build_dictionary. (Jul 22, 2019)
c8ec403  convert two functions to the internals. (Jul 22, 2019)
0568a04  Change the API of _load_text_classification_data() function. (Jul 22, 2019)
78673a5  use a static list for url. (Jul 22, 2019)
58e3bac  use logging.info instead of print. (Jul 22, 2019)
81e5a31  combine download and extract_archive (Jul 22, 2019)
e05d7fe  Merge branch 'master' into new_pattern (cpuhrsch, Jul 23, 2019)
7ffb267  Merge branch 'new_supervised_learning_dataset' into new_pattern (Jul 23, 2019)
e138fa8  examples (Jul 23, 2019)
1e9f0e1  remove more (Jul 23, 2019)
c746d86  less (Jul 23, 2019)
fea3bad  split (Jul 24, 2019)
5c90fbc  ordered dict (Jul 24, 2019)
ba23ae1  Merge remote-tracking branch 'upstream/master' into tutorial (Jul 24, 2019)
3df4dc1  rename (Jul 24, 2019)
ea639c2  Simplifications (Jul 24, 2019)
193a670  clean more (Jul 24, 2019)
285a515  more efficient dictionary building (Jul 24, 2019)
fc1fcc1  Merge branch 'master' into tutorial (Jul 24, 2019)
3e27dcd  Reduce code (Jul 24, 2019)
4678478  tar and extraction (Jul 24, 2019)
2a18586  Merge branch 'additionalstuff' into tutorial (Jul 24, 2019)
ee9894f  rebase (Jul 24, 2019)
197c70d  remove legacy (Jul 24, 2019)
bc2369f  more logging and args (Jul 25, 2019)
0e81889  more (Jul 25, 2019)
75fd515  small changes (Jul 25, 2019)
e7ea6c2  More small changes (Jul 25, 2019)
accf587  Update docs (Jul 25, 2019)
5506a2e  bring back examples (Jul 25, 2019)
2c8c4bf  bring back examples (Jul 25, 2019)
28b0976  small fix (Jul 25, 2019)
93f2f18  class to function (Jul 25, 2019)
2809221  Use DataLoader (Jul 25, 2019)
ae540b0  format (Jul 25, 2019)
5520b51  Small test fix (Jul 25, 2019)
9013bbd  Merge branch 'additionalstuff' into splitmore (Jul 25, 2019)
2313ffb  More logging and nits (Jul 25, 2019)
ff9132d  Use io.open (Jul 25, 2019)
3a0db45  Merge branch 'additionalstuff' into splitmore (Jul 25, 2019)
331bf79  flake8 (Jul 25, 2019)
90f8bbb  Merge branch 'additionalstuff' into splitmore (Jul 25, 2019)
2460eaf  flake8 (Jul 25, 2019)
73772e2  flake8 (Jul 25, 2019)
6dcc052  Merge branch 'additionalstuff' into splitmore (Jul 25, 2019)
b4e5067  flake8 (Jul 25, 2019)
c1dc6ae  Merge remote-tracking branch 'upstream/master' into splitmore (Jul 26, 2019)
7256ffc  Deal with tests (Jul 26, 2019)
6 changes: 3 additions & 3 deletions test/data/test_builtin_datasets.py
@@ -55,9 +55,9 @@ def test_text_classification(self):
         datadir = os.path.join(self.project_root, ".data")
         if not os.path.exists(datadir):
             os.mkdir(datadir)
-        ag_news_cls = AG_NEWS(root=datadir, ngrams=3)
-        self.assertEqual(len(ag_news_cls.train_examples), 120000)
-        self.assertEqual(len(ag_news_cls.test_examples), 7600)
+        ag_news_train, ag_news_test = AG_NEWS(root=datadir, ngrams=3)
+        self.assertEqual(len(ag_news_train), 120000)
+        self.assertEqual(len(ag_news_test), 7600)
 
         # Delete the dataset after we're done to save disk space on CI
         if os.environ.get("TRAVIS") == "true":
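One of the commits above switches the internals to DataLoader. A hedged sketch of how the returned splits could be consumed that way; each example is assumed to be a (label, token-id tensor) pair, and the collate function, batch size, and variable names are illustrative rather than part of this PR:

    import torch
    from torch.utils.data import DataLoader
    # Import path assumed, as in the sketch near the top of this page.
    from torchtext.datasets.text_classification import AG_NEWS

    train_dataset, test_dataset = AG_NEWS(root='.data', ngrams=3)

    def collate_batch(batch):
        # batch is a list of (label, token_id_tensor) pairs (assumed format).
        labels = torch.tensor([label for label, _ in batch])
        texts = [tokens for _, tokens in batch]
        # Cumulative offsets into the concatenated token tensor, a common
        # pattern for feeding nn.EmbeddingBag.
        offsets = torch.tensor([0] + [len(t) for t in texts[:-1]]).cumsum(dim=0)
        return labels, torch.cat(texts), offsets

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,
                              collate_fn=collate_batch)
    for labels, text, offsets in train_loader:
        pass  # feed the batch to a model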
21 changes: 14 additions & 7 deletions test/data/test_utils.py
@@ -1,35 +1,42 @@
 import six
 import torchtext.data as data
 
+import pytest
 from ..common.torchtext_test_case import TorchtextTestCase
 
 
 class TestUtils(TorchtextTestCase):
-    def test_get_tokenizer(self):
+    TEST_STR = "A string, particularly one with slightly complex punctuation."
+
+    def test_get_tokenizer_split(self):
         # Test the default case with str.split
         assert data.get_tokenizer(str.split) == str.split
-        test_str = "A string, particularly one with slightly complex punctuation."
-        assert data.get_tokenizer(str.split)(test_str) == str.split(test_str)
+        assert data.get_tokenizer(str.split)(self.TEST_STR) == str.split(self.TEST_STR)
 
+    def test_get_tokenizer_spacy(self):
         # Test SpaCy option, and verify it properly handles punctuation.
-        assert data.get_tokenizer("spacy")(six.text_type(test_str)) == [
+        assert data.get_tokenizer("spacy")(six.text_type(self.TEST_STR)) == [
             "A", "string", ",", "particularly", "one", "with", "slightly",
             "complex", "punctuation", "."]
 
+    # TODO: Remove this once issue was been resolved.
+    @pytest.mark.skip(reason=("Impractically slow! "
+                              "https://github.com/alvations/sacremoses/issues/61"))
+    def test_get_tokenizer_moses(self):
         # Test Moses option.
         # Note that internally, MosesTokenizer converts to unicode if applicable
         moses_tokenizer = data.get_tokenizer("moses")
-        assert moses_tokenizer(test_str) == [
+        assert moses_tokenizer(self.TEST_STR) == [
             "A", "string", ",", "particularly", "one", "with", "slightly",
             "complex", "punctuation", "."]
 
         # Nonbreaking prefixes should tokenize the final period.
         assert moses_tokenizer(six.text_type("abc def.")) == ["abc", "def", "."]
 
+    def test_get_tokenizer_toktokt(self):
         # Test Toktok option. Test strings taken from NLTK doctests.
         # Note that internally, MosesTokenizer converts to unicode if applicable
         toktok_tokenizer = data.get_tokenizer("toktok")
-        assert toktok_tokenizer(test_str) == [
+        assert toktok_tokenizer(self.TEST_STR) == [
             "A", "string", ",", "particularly", "one", "with", "slightly",
             "complex", "punctuation", "."]
 
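For context, the tokenizer backends exercised above can be compared directly through get_tokenizer. A short sketch; the spacy and toktok backends need their respective packages installed, which is assumed here:

    import torchtext.data as data

    test_str = "A string, particularly one with slightly complex punctuation."

    # str.split is passed through unchanged, so punctuation stays attached
    # to the neighbouring word.
    print(data.get_tokenizer(str.split)(test_str))

    # The "toktok" backend splits punctuation into separate tokens, matching
    # the expected list in the test above.
    print(data.get_tokenizer("toktok")(test_str))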