Skip to content

Commit f1fac7a

Browse files
committed
missing download script
1 parent ed82d42 commit f1fac7a

File tree

1 file changed

+25
-1
lines changed

1 file changed

+25
-1
lines changed
Lines changed: 25 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1,25 @@
1-
URL = "http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz"
1+
"""Script to download the 20 newsgroups text classification set"""
2+
3+
import os
4+
import urllib
5+
import tarfile
6+
7+
import shutil
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

# Location of the "bydate" 20 newsgroups archive.
URL = ("http://people.csail.mit.edu/jrennie/"
       "20Newsgroups/20news-bydate.tar.gz")

# Local file name for the downloaded archive and the two folders that are
# expected in the current working directory once it has been extracted.
ARCHIVE_NAME = "20news-bydate.tar.gz"
TRAIN_FOLDER = "20news-bydate-train"
TEST_FOLDER = "20news-bydate-test"


# Only do any work when at least one of the extracted folders is missing.
if not os.path.exists(TRAIN_FOLDER) or not os.path.exists(TEST_FOLDER):

    # Reuse an archive already present in the working directory.
    if not os.path.exists(ARCHIVE_NAME):
        print("Downloading dataset from %s (14 MB)" % URL)
        # Stream to a temporary name and rename only on success, so that an
        # interrupted download is never mistaken for a complete archive by
        # the existence check above on the next run.
        partial_name = ARCHIVE_NAME + ".part"
        with closing(urlopen(URL)) as response:
            with open(partial_name, 'wb') as archive_file:
                # Copy in chunks instead of holding the whole payload in
                # memory at once.
                shutil.copyfileobj(response, archive_file)
        os.rename(partial_name, ARCHIVE_NAME)

    print("Decompressing %s" % ARCHIVE_NAME)
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive, and the archive is fetched over plain HTTP. Consider
    # validating member names (or using an extraction filter where the
    # running Python version provides one) before extracting.
    # closing() guarantees the archive handle is released before the file is
    # removed, without relying on TarFile being a context manager itself.
    with closing(tarfile.open(ARCHIVE_NAME, "r:gz")) as archive:
        archive.extractall(path='.')
    os.remove(ARCHIVE_NAME)
25+

0 commit comments

Comments
 (0)