Skip to content

Commit 970edfb

Browse files
author
Christian Puhrsch
committed
Merge branch 'master' of github.com:pytorch/text into docstring1
2 parents 427df86 + 3e05f58 commit 970edfb

File tree

3 files changed

+28
-0
lines changed

3 files changed

+28
-0
lines changed

test/asset/raw_datasets.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,4 @@
4444
{"dataset_name": "SQuAD1", "split": "dev", "NUM_LINES": 10570, "MD5": {"train": "981b29407e0affa3b1b156f72073b945", "dev": "3e85deb501d4e538b6bc56f786231552"}, "URL": {"train": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json", "dev": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"}, "first_line": ["Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24\u201310 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the \"golden anniversary\" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as \"Super Bowl L\"), so that the logo could prominently feature the Arabic numerals 50.", "Which NFL team represented the AFC at Super Bowl 50?", ["Denver Broncos", "Denver Broncos", "Denver Broncos"], [177, 177, 177]]}
4545
{"dataset_name": "SQuAD2", "split": "train", "NUM_LINES": 130319, "MD5": {"train": "62108c273c268d70893182d5cf8df740", "dev": "246adae8b7002f8679c027697b0b7cf8"}, "URL": {"train": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json", "dev": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"}, "first_line": ["Beyonc\u00e9 Giselle Knowles-Carter (/bi\u02d0\u02c8j\u0252nse\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\".", "When did Beyonce start becoming popular?", ["in the late 1990s"], [269]]}
4646
{"dataset_name": "SQuAD2", "split": "dev", "NUM_LINES": 11873, "MD5": {"train": "62108c273c268d70893182d5cf8df740", "dev": "246adae8b7002f8679c027697b0b7cf8"}, "URL": {"train": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json", "dev": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"}, "first_line": ["The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.", "In what country is Normandy located?", ["France", "France", "France", "France"], [159, 159, 159, 159]]}
47+
{"dataset_name": "EnWik9", "split": "train", "NUM_LINES": 13147026, "MD5": "3e773f8a1577fda2e27f871ca17f31fd", "URL": "http://mattmahoney.net/dc/enwik9.zip", "first_line": "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd\" version=\"0.3\" xml:lang=\"en\">\n"}

torchtext/datasets/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from .amazonreviewpolarity import AmazonReviewPolarity
55
from .conll2000chunking import CoNLL2000Chunking
66
from .dbpedia import DBpedia
7+
from .enwik9 import EnWik9
78
from .imdb import IMDB
89
from .iwslt import IWSLT
910
from .multi30k import Multi30k
@@ -26,6 +27,7 @@
2627
'AmazonReviewPolarity': AmazonReviewPolarity,
2728
'CoNLL2000Chunking': CoNLL2000Chunking,
2829
'DBpedia': DBpedia,
30+
'EnWik9': EnWik9,
2931
'IMDB': IMDB,
3032
'IWSLT': IWSLT,
3133
'Multi30k': Multi30k,

torchtext/datasets/enwik9.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import logging
2+
from torchtext.utils import download_from_url, extract_archive
3+
from torchtext.datasets.common import RawTextIterableDataset
4+
from torchtext.datasets.common import wrap_split_argument
5+
from torchtext.datasets.common import add_docstring_header
6+
import io
7+
8+
# Location of the zipped enwik9 archive; passed to download_from_url() by EnWik9().
URL = 'http://mattmahoney.net/dc/enwik9.zip'
9+
10+
# Expected MD5 hex digest of the downloaded archive (supplied as hash_value with hash_type='md5').
MD5 = '3e773f8a1577fda2e27f871ca17f31fd'
11+
12+
# Per-split value handed to RawTextIterableDataset; presumably the line count of
# the extracted text file. Only the 'train' split is defined.
NUM_LINES = {
13+
'train': 13147026
14+
}
15+
16+
17+
@wrap_split_argument
@add_docstring_header()
def EnWik9(root='.data', split='train', offset=0):
    # NOTE(review): no literal docstring is written here on purpose --
    # `add_docstring_header` presumably builds the function's docstring;
    # confirm how it treats an existing one before adding text.

    # Download the archive into `root`, validating it against the MD5 constant.
    archive_path = download_from_url(URL, root=root, hash_value=MD5, hash_type='md5')

    # Only the first extracted entry is read.
    text_path = extract_archive(archive_path)[0]

    # `split` is indexed rather than used directly; presumably
    # `wrap_split_argument` delivers it as a sequence -- TODO confirm.
    split_name = split[0]
    logging.info('Creating {} data'.format(split_name))

    # Look up the line count before opening the file so that an unknown split
    # fails without leaving an open handle behind.
    num_lines = NUM_LINES[split_name]
    line_iter = iter(io.open(text_path, encoding="utf8"))

    dataset = RawTextIterableDataset('EnWik9', num_lines, line_iter, offset=offset)
    return [dataset]

0 commit comments

Comments
 (0)