Turn text classification datasets into functions and return training and text examples separately. #566
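This PR turns the text classification dataset constructors into plain functions that return the train and test splits as a tuple, instead of a single dataset object with train_examples/test_examples attributes. A minimal usage sketch based on the updated test further below; the import path and root directory are assumptions and not part of this diff:

    # Sketch of the new function-style API (import path and root assumed;
    # only the tuple return and the split sizes come from the test below).
    from torchtext.datasets.text_classification import AG_NEWS

    train_dataset, test_dataset = AG_NEWS(root='.data', ngrams=3)

    print(len(train_dataset))  # 120000 AG_NEWS training examples
    print(len(test_dataset))   # 7600 AG_NEWS test examples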

Merged: 59 commits, Jul 26, 2019
Commits
c30ec38  add new APIs to build dataset. (Jul 9, 2019)
ffd1f49  Add new datasets for text classification. (Jul 11, 2019)
3b7b0e2  Add docs and examples. (Jul 9, 2019)
5a31dd3  Split text_normalize out of preprocess function. (Jul 11, 2019)
5efa58e  Add docs and test case. (Jul 11, 2019)
844242a  Update README file. (Jul 12, 2019)
b373de9  revise generate_iters() function. (Jul 22, 2019)
6d5cb03  Remove TextDataset class. (Jul 22, 2019)
3f0c523  Remove generate_iterators() API (Jul 22, 2019)
2f20914  remove unnecessary library loading (Jul 22, 2019)
57d0d03  Re-name build_vocab to build_dictionary (Jul 22, 2019)
4cf4099  change build_vocab to build_dictionary. (Jul 22, 2019)
c8ec403  convert two functions to the internals. (Jul 22, 2019)
0568a04  Change the API of _load_text_classification_data() function. (Jul 22, 2019)
78673a5  use a static list for url. (Jul 22, 2019)
58e3bac  use logging.info instead of print. (Jul 22, 2019)
81e5a31  combine download and extract_archive (Jul 22, 2019)
e05d7fe  Merge branch 'master' into new_pattern (cpuhrsch, Jul 23, 2019)
7ffb267  Merge branch 'new_supervised_learning_dataset' into new_pattern (Jul 23, 2019)
e138fa8  examples (Jul 23, 2019)
1e9f0e1  remove more (Jul 23, 2019)
c746d86  less (Jul 23, 2019)
fea3bad  split (Jul 24, 2019)
5c90fbc  ordered dict (Jul 24, 2019)
ba23ae1  Merge remote-tracking branch 'upstream/master' into tutorial (Jul 24, 2019)
3df4dc1  rename (Jul 24, 2019)
ea639c2  Simplifications (Jul 24, 2019)
193a670  clean more (Jul 24, 2019)
285a515  more efficient dictionary building (Jul 24, 2019)
fc1fcc1  Merge branch 'master' into tutorial (Jul 24, 2019)
3e27dcd  Reduce code (Jul 24, 2019)
4678478  tar and extraction (Jul 24, 2019)
2a18586  Merge branch 'additionalstuff' into tutorial (Jul 24, 2019)
ee9894f  rebase (Jul 24, 2019)
197c70d  remove legacy (Jul 24, 2019)
bc2369f  more logging and args (Jul 25, 2019)
0e81889  more (Jul 25, 2019)
75fd515  small changes (Jul 25, 2019)
e7ea6c2  More small changes (Jul 25, 2019)
accf587  Update docs (Jul 25, 2019)
5506a2e  bring back examples (Jul 25, 2019)
2c8c4bf  bring back examples (Jul 25, 2019)
28b0976  small fix (Jul 25, 2019)
93f2f18  class to function (Jul 25, 2019)
2809221  Use DataLoader (Jul 25, 2019)
ae540b0  format (Jul 25, 2019)
5520b51  Small test fix (Jul 25, 2019)
9013bbd  Merge branch 'additionalstuff' into splitmore (Jul 25, 2019)
2313ffb  More logging and nits (Jul 25, 2019)
ff9132d  Use io.open (Jul 25, 2019)
3a0db45  Merge branch 'additionalstuff' into splitmore (Jul 25, 2019)
331bf79  flake8 (Jul 25, 2019)
90f8bbb  Merge branch 'additionalstuff' into splitmore (Jul 25, 2019)
2460eaf  flake8 (Jul 25, 2019)
73772e2  flake8 (Jul 25, 2019)
6dcc052  Merge branch 'additionalstuff' into splitmore (Jul 25, 2019)
b4e5067  flake8 (Jul 25, 2019)
c1dc6ae  Merge remote-tracking branch 'upstream/master' into splitmore (Jul 26, 2019)
7256ffc  Deal with tests (Jul 26, 2019)
6 changes: 3 additions & 3 deletions test/data/test_builtin_datasets.py
@@ -55,9 +55,9 @@ def test_text_classification(self):
         datadir = os.path.join(self.project_root, ".data")
         if not os.path.exists(datadir):
             os.mkdir(datadir)
-        ag_news_cls = AG_NEWS(root=datadir, ngrams=3)
-        self.assertEqual(len(ag_news_cls.train_examples), 120000)
-        self.assertEqual(len(ag_news_cls.test_examples), 7600)
+        ag_news_train, ag_news_test = AG_NEWS(root=datadir, ngrams=3)
+        self.assertEqual(len(ag_news_train), 120000)
+        self.assertEqual(len(ag_news_test), 7600)
 
         # Delete the dataset after we're done to save disk space on CI
         if os.environ.get("TRAVIS") == "true":
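One of the commits above switches the internals to DataLoader. A hedged sketch of how the returned splits could be consumed that way; each example is assumed to be a (label, token-id tensor) pair, and the collate function, batch size, and variable names are illustrative rather than part of this PR:

    import torch
    from torch.utils.data import DataLoader
    # Import path assumed, as in the sketch near the top of this page.
    from torchtext.datasets.text_classification import AG_NEWS

    train_dataset, test_dataset = AG_NEWS(root='.data', ngrams=3)

    def collate_batch(batch):
        # batch is a list of (label, token_id_tensor) pairs (assumed format).
        labels = torch.tensor([label for label, _ in batch])
        texts = [tokens for _, tokens in batch]
        # Cumulative offsets into the concatenated token tensor, a common
        # pattern for feeding nn.EmbeddingBag.
        offsets = torch.tensor([0] + [len(t) for t in texts[:-1]]).cumsum(dim=0)
        return labels, torch.cat(texts), offsets

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,
                              collate_fn=collate_batch)
    for labels, text, offsets in train_loader:
        pass  # feed the batch to a model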
21 changes: 14 additions & 7 deletions test/data/test_utils.py
@@ -1,35 +1,42 @@
 import six
 import torchtext.data as data
 
+import pytest
 from ..common.torchtext_test_case import TorchtextTestCase
 
 
 class TestUtils(TorchtextTestCase):
-    def test_get_tokenizer(self):
+    TEST_STR = "A string, particularly one with slightly complex punctuation."
+
+    def test_get_tokenizer_split(self):
         # Test the default case with str.split
         assert data.get_tokenizer(str.split) == str.split
-        test_str = "A string, particularly one with slightly complex punctuation."
-        assert data.get_tokenizer(str.split)(test_str) == str.split(test_str)
+        assert data.get_tokenizer(str.split)(self.TEST_STR) == str.split(self.TEST_STR)
 
+    def test_get_tokenizer_spacy(self):
         # Test SpaCy option, and verify it properly handles punctuation.
-        assert data.get_tokenizer("spacy")(six.text_type(test_str)) == [
+        assert data.get_tokenizer("spacy")(six.text_type(self.TEST_STR)) == [
             "A", "string", ",", "particularly", "one", "with", "slightly",
             "complex", "punctuation", "."]
 
+    # TODO: Remove this once issue was been resolved.
+    @pytest.mark.skip(reason=("Impractically slow! "
+                              "https://github.com/alvations/sacremoses/issues/61"))
+    def test_get_tokenizer_moses(self):
         # Test Moses option.
         # Note that internally, MosesTokenizer converts to unicode if applicable
         moses_tokenizer = data.get_tokenizer("moses")
-        assert moses_tokenizer(test_str) == [
+        assert moses_tokenizer(self.TEST_STR) == [
             "A", "string", ",", "particularly", "one", "with", "slightly",
             "complex", "punctuation", "."]
 
         # Nonbreaking prefixes should tokenize the final period.
         assert moses_tokenizer(six.text_type("abc def.")) == ["abc", "def", "."]
 
+    def test_get_tokenizer_toktokt(self):
         # Test Toktok option. Test strings taken from NLTK doctests.
         # Note that internally, MosesTokenizer converts to unicode if applicable
         toktok_tokenizer = data.get_tokenizer("toktok")
-        assert toktok_tokenizer(test_str) == [
+        assert toktok_tokenizer(self.TEST_STR) == [
             "A", "string", ",", "particularly", "one", "with", "slightly",
             "complex", "punctuation", "."]
 
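For context, the tokenizer backends exercised above can be compared directly through get_tokenizer. A short sketch; the spacy and toktok backends need their respective packages installed, which is assumed here:

    import torchtext.data as data

    test_str = "A string, particularly one with slightly complex punctuation."

    # str.split is passed through unchanged, so punctuation stays attached
    # to the neighbouring word.
    print(data.get_tokenizer(str.split)(test_str))

    # The "toktok" backend splits punctuation into separate tokens, matching
    # the expected list in the test above.
    print(data.get_tokenizer("toktok")(test_str))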