From 9d608b353d8b346d7ee52321090ba4d34cddd871 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 12 Aug 2020 11:56:24 -0700 Subject: [PATCH 01/12] update download func --- torchtext/utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/torchtext/utils.py b/torchtext/utils.py index d8348e840b..d313101fb5 100644 --- a/torchtext/utils.py +++ b/torchtext/utils.py @@ -100,6 +100,15 @@ def _process_response(r, root, filename): print("Can't create the download directory {}.".format(root)) raise + if filename is not None: + path = os.path.join(root, filename) + #skip requests.get if path exists and not overwrite. + if os.path.exists(path): + logging.info('File %s already exists.' % path) + if not overwrite: + _check_hash(path) + return path + if 'drive.google.com' not in url: response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, stream=True) return _process_response(response, root, filename) From 6ae9c8e17ad4990599b33ccc029f3e2a2858396a Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 12 Aug 2020 12:00:37 -0700 Subject: [PATCH 02/12] push glove tests back --- test/experimental/test_vectors.py | 115 +++++++++++++++--------------- 1 file changed, 57 insertions(+), 58 deletions(-) diff --git a/test/experimental/test_vectors.py b/test/experimental/test_vectors.py index fdf39a3ecd..371ccecf5b 100644 --- a/test/experimental/test_vectors.py +++ b/test/experimental/test_vectors.py @@ -203,61 +203,60 @@ def test_fast_text(self): self.assertEqual(vectors_obj[word][:3], expected_fasttext_simple_en[word]) self.assertEqual(jit_vectors_obj[word][:3], expected_fasttext_simple_en[word]) - # TODO: reenable test once the GloVe dataset url starts working - # def test_glove(self): - # # copy the asset file into the expected download location - # # note that this is just a zip file with the first 100 entries of the GloVe 840B dataset - # asset_name = 'glove.840B.300d.zip' - # asset_path = get_asset_path(asset_name) - - # with 
tempfile.TemporaryDirectory() as dir_name: - # data_path = os.path.join(dir_name, asset_name) - # shutil.copy(asset_path, data_path) - # vectors_obj = GloVe(root=dir_name, validate_file=False) - # jit_vectors_obj = torch.jit.script(vectors_obj) - - # # The first 3 entries in each vector. - # expected_glove = { - # 'the': [0.27204, -0.06203, -0.1884], - # 'people': [-0.19686, 0.11579, -0.41091], - # } - - # for word in expected_glove.keys(): - # self.assertEqual(vectors_obj[word][:3], expected_glove[word]) - # self.assertEqual(jit_vectors_obj[word][:3], expected_glove[word]) - - # def test_glove_different_dims(self): - # # copy the asset file into the expected download location - # # note that this is just a zip file with 1 line txt files used to test that the - # # correct files are being loaded - # asset_name = 'glove.6B.zip' - # asset_path = get_asset_path(asset_name) - - # with tempfile.TemporaryDirectory() as dir_name: - # data_path = os.path.join(dir_name, asset_name) - # shutil.copy(asset_path, data_path) - - # glove_50d = GloVe(name='6B', dim=50, root=dir_name, validate_file=False) - # glove_100d = GloVe(name='6B', dim=100, root=dir_name, validate_file=False) - # glove_200d = GloVe(name='6B', dim=200, root=dir_name, validate_file=False) - # glove_300d = GloVe(name='6B', dim=300, root=dir_name, validate_file=False) - # vectors_objects = [glove_50d, glove_100d, glove_200d, glove_300d] - - # # The first 3 entries in each vector. 
- # expected_glove_50d = { - # 'the': [0.418, 0.24968, -0.41242], - # } - # expected_glove_100d = { - # 'the': [-0.038194, -0.24487, 0.72812], - # } - # expected_glove_200d = { - # 'the': [-0.071549, 0.093459, 0.023738], - # } - # expected_glove_300d = { - # 'the': [0.04656, 0.21318, -0.0074364], - # } - # expected_gloves = [expected_glove_50d, expected_glove_100d, expected_glove_200d, expected_glove_300d] - - # for vectors_obj, expected_glove in zip(vectors_objects, expected_gloves): - # for word in expected_glove.keys(): - # self.assertEqual(vectors_obj[word][:3], expected_glove[word]) + def test_glove(self): + # copy the asset file into the expected download location + # note that this is just a zip file with the first 100 entries of the GloVe 840B dataset + asset_name = 'glove.840B.300d.zip' + asset_path = get_asset_path(asset_name) + + with tempfile.TemporaryDirectory() as dir_name: + data_path = os.path.join(dir_name, asset_name) + shutil.copy(asset_path, data_path) + vectors_obj = GloVe(root=dir_name, validate_file=False) + jit_vectors_obj = torch.jit.script(vectors_obj) + + # The first 3 entries in each vector. 
+ expected_glove = { + 'the': [0.27204, -0.06203, -0.1884], + 'people': [-0.19686, 0.11579, -0.41091], + } + + for word in expected_glove.keys(): + self.assertEqual(vectors_obj[word][:3], expected_glove[word]) + self.assertEqual(jit_vectors_obj[word][:3], expected_glove[word]) + + def test_glove_different_dims(self): + # copy the asset file into the expected download location + # note that this is just a zip file with 1 line txt files used to test that the + # correct files are being loaded + asset_name = 'glove.6B.zip' + asset_path = get_asset_path(asset_name) + + with tempfile.TemporaryDirectory() as dir_name: + data_path = os.path.join(dir_name, asset_name) + shutil.copy(asset_path, data_path) + + glove_50d = GloVe(name='6B', dim=50, root=dir_name, validate_file=False) + glove_100d = GloVe(name='6B', dim=100, root=dir_name, validate_file=False) + glove_200d = GloVe(name='6B', dim=200, root=dir_name, validate_file=False) + glove_300d = GloVe(name='6B', dim=300, root=dir_name, validate_file=False) + vectors_objects = [glove_50d, glove_100d, glove_200d, glove_300d] + + # The first 3 entries in each vector. 
+ expected_glove_50d = { + 'the': [0.418, 0.24968, -0.41242], + } + expected_glove_100d = { + 'the': [-0.038194, -0.24487, 0.72812], + } + expected_glove_200d = { + 'the': [-0.071549, 0.093459, 0.023738], + } + expected_glove_300d = { + 'the': [0.04656, 0.21318, -0.0074364], + } + expected_gloves = [expected_glove_50d, expected_glove_100d, expected_glove_200d, expected_glove_300d] + + for vectors_obj, expected_glove in zip(vectors_objects, expected_gloves): + for word in expected_glove.keys(): + self.assertEqual(vectors_obj[word][:3], expected_glove[word]) From 787acfee926680da0c7ac4ffa0df11e4d40a207b Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 12 Aug 2020 12:26:19 -0700 Subject: [PATCH 03/12] flake8 --- torchtext/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtext/utils.py b/torchtext/utils.py index d313101fb5..1e4974df21 100644 --- a/torchtext/utils.py +++ b/torchtext/utils.py @@ -102,7 +102,7 @@ def _process_response(r, root, filename): if filename is not None: path = os.path.join(root, filename) - #skip requests.get if path exists and not overwrite. + # skip requests.get if path exists and not overwrite. if os.path.exists(path): logging.info('File %s already exists.' 
% path) if not overwrite: From 40d51c756c1771441956c61a53d491858641e88b Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 12 Aug 2020 16:11:35 -0700 Subject: [PATCH 04/12] add no download test --- test/test_utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 73ee992de3..3d205794a3 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -104,6 +104,15 @@ def test_download_extract_zip(self): os.rmdir(os.path.join(root, 'en-ud-v2')) conditional_remove(archive_path) + def test_no_download(self): + asset_name = 'glove.840B.300d.zip' + asset_path = get_asset_path(asset_name) + with tempfile.TemporaryDirectory() as dir_name: + data_path = os.path.join(dir_name, '.data', asset_name) + shutil.copy(asset_path, data_path) + file_path = download_from_url('fakedownload/glove.840B.300d.zip') + assertEqual(file_path, data_path) + def test_download_extract_to_path(self): # create root directory for downloading data root = '.data' From 1532fc584cac44578cdf1984647be677d1418db9 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 12 Aug 2020 16:40:12 -0700 Subject: [PATCH 05/12] flake8 --- test/test_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 3d205794a3..821d85e105 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -3,6 +3,9 @@ import os from torchtext import utils from .common.torchtext_test_case import TorchtextTestCase +from test.common.assets import get_asset_path +import tempfile +import shutil def conditional_remove(f): From 1094d9e42c1f8fe6bdcff08ce0c157d23a0aeac8 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 12 Aug 2020 18:14:26 -0700 Subject: [PATCH 06/12] checkpoint --- test/test_utils.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 821d85e105..b6935c77bb 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -107,15 +107,6 @@ def 
test_download_extract_zip(self): os.rmdir(os.path.join(root, 'en-ud-v2')) conditional_remove(archive_path) - def test_no_download(self): - asset_name = 'glove.840B.300d.zip' - asset_path = get_asset_path(asset_name) - with tempfile.TemporaryDirectory() as dir_name: - data_path = os.path.join(dir_name, '.data', asset_name) - shutil.copy(asset_path, data_path) - file_path = download_from_url('fakedownload/glove.840B.300d.zip') - assertEqual(file_path, data_path) - def test_download_extract_to_path(self): # create root directory for downloading data root = '.data' From d7e1e80e4aefb08cd7de9cdc02bed834ed35bbd4 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 12 Aug 2020 18:15:26 -0700 Subject: [PATCH 07/12] checkpoint --- test/test_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index b6935c77bb..73ee992de3 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -3,9 +3,6 @@ import os from torchtext import utils from .common.torchtext_test_case import TorchtextTestCase -from test.common.assets import get_asset_path -import tempfile -import shutil def conditional_remove(f): From 34967f9f880f79c0c4ef83d6fc1db5e16ced75a0 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 13 Aug 2020 07:16:58 -0700 Subject: [PATCH 08/12] checkpoint --- test/test_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 73ee992de3..a8b9e39ce7 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -3,6 +3,9 @@ import os from torchtext import utils from .common.torchtext_test_case import TorchtextTestCase +from test.common.assets import get_asset_path +import tempfile +import shutil def conditional_remove(f): @@ -104,6 +107,15 @@ def test_download_extract_zip(self): os.rmdir(os.path.join(root, 'en-ud-v2')) conditional_remove(archive_path) + def test_no_download(self): + asset_name = 'glove.840B.300d.zip' + asset_path = get_asset_path(asset_name) + 
os.makedirs('.data') + data_path = os.path.join('.data', asset_name) + shutil.copy(asset_path, data_path) + file_path = download_from_url('fakedownload/glove.840B.300d.zip') + assertEqual(file_path, data_path) + def test_download_extract_to_path(self): # create root directory for downloading data root = '.data' From 7b548e5fc8a4c98c50e1e95512c85cfccced243d Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 13 Aug 2020 07:29:50 -0700 Subject: [PATCH 09/12] checkpoint --- test/test_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index a8b9e39ce7..31dbae98c1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -110,7 +110,9 @@ def test_download_extract_zip(self): def test_no_download(self): asset_name = 'glove.840B.300d.zip' asset_path = get_asset_path(asset_name) - os.makedirs('.data') + root = '.data' + if not os.path.exists(root): + os.makedirs(root) data_path = os.path.join('.data', asset_name) shutil.copy(asset_path, data_path) file_path = download_from_url('fakedownload/glove.840B.300d.zip') From 99bbd82eac76b9573048fbc3f13ce295c765fbf4 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 13 Aug 2020 07:30:19 -0700 Subject: [PATCH 10/12] checkpoint --- test/test_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 31dbae98c1..d2b353c4a9 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,7 +4,6 @@ from torchtext import utils from .common.torchtext_test_case import TorchtextTestCase from test.common.assets import get_asset_path -import tempfile import shutil From 5af2c46176ab7f9f678f708bd1aaf1185beaf318 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 13 Aug 2020 07:50:08 -0700 Subject: [PATCH 11/12] checkpoint --- test/test_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index d2b353c4a9..5cab8b92de 100644 --- a/test/test_utils.py +++ 
b/test/test_utils.py @@ -114,8 +114,9 @@ def test_no_download(self): os.makedirs(root) data_path = os.path.join('.data', asset_name) shutil.copy(asset_path, data_path) - file_path = download_from_url('fakedownload/glove.840B.300d.zip') + file_path = utils.download_from_url('fakedownload/glove.840B.300d.zip') assertEqual(file_path, data_path) + conditional_remove(data_path) def test_download_extract_to_path(self): # create root directory for downloading data From 3b7608e0f2a51d3a9bc6c80d4dd5d133ce9120f3 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 13 Aug 2020 08:08:01 -0700 Subject: [PATCH 12/12] checkpoint --- test/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 5cab8b92de..1f9f75369c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -115,7 +115,7 @@ def test_no_download(self): data_path = os.path.join('.data', asset_name) shutil.copy(asset_path, data_path) file_path = utils.download_from_url('fakedownload/glove.840B.300d.zip') - assertEqual(file_path, data_path) + self.assertEqual(file_path, data_path) conditional_remove(data_path) def test_download_extract_to_path(self):