Fix ArchiveReader to keep archive path (#73)

ejguan · facebook-github-bot · commit d28a139cd99f · 2021-10-20T16:52:22.000-07:00
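Summary: Pull Request resolved: #73 The previous implementation of `ArchiveReader` had a bug: it joined each extracted member's name onto the archive's parent folder rather than onto the archive's own path, so the archive path was dropped from the yielded pathnames. For background, see pytorch/pytorch#65424 (comment). Reviewed By: NivekT Differential Revision: D31797765 fbshipit-source-id: 494e1a49b43d5a846de971a67586089e6d7ebafc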
diff --git a/examples/text/amazonreviewpolarity.py b/examples/text/amazonreviewpolarity.py
@@ -24,8 +24,8 @@
 _PATH = "amazon_review_polarity_csv.tar.gz"
 
 _EXTRACTED_FILES = {
-    "train": f"{os.sep}".join(["amazon_review_polarity_csv", "train.csv"]),
-    "test": f"{os.sep}".join(["amazon_review_polarity_csv", "test.csv"]),
+    "train": f"{os.sep}".join([_PATH, "amazon_review_polarity_csv", "train.csv"]),
+    "test": f"{os.sep}".join([_PATH, "amazon_review_polarity_csv", "test.csv"]),
 }
 
 _EXTRACTED_FILES_MD5 = {
diff --git a/examples/text/sst2.py b/examples/text/sst2.py
diff --git a/test/test_examples.py b/test/test_examples.py
@@ -3,11 +3,17 @@
 import sys
 import unittest
 
+from torch.testing._internal.common_utils import slowTest
 
 current = os.path.dirname(os.path.realpath(__file__))
 ROOT = os.path.dirname(current)
 sys.path.append(ROOT)
 
+from examples.text.ag_news import AG_NEWS
+from examples.text.amazonreviewpolarity import AmazonReviewPolarity
+from examples.text.imdb import IMDB
+from examples.text.squad1 import SQuAD1
+from examples.text.squad2 import SQuAD2
 from examples.vision.caltech101 import Caltech101
 from examples.vision.caltech256 import Caltech256
 
@@ -42,5 +48,33 @@ def test_Caltech256(self) -> None:
         self.assertEqual(6, len(samples))
 
 
+# TODO: Replace the following tests with the corresponding tests in TorchText
+class TestTextExamples(unittest.TestCase):
+    def _test_helper(self, fn):
+        dp = fn()
+        for stage_dp in dp:
+            _ = list(stage_dp)
+
+    @slowTest
+    def test_AG_NEWS(self) -> None:
+        self._test_helper(AG_NEWS)
+
+    @slowTest
+    def test_AmazonReviewPolarity(self) -> None:
+        self._test_helper(AmazonReviewPolarity)
+
+    @slowTest
+    def test_IMDB(self) -> None:
+        self._test_helper(IMDB)
+
+    @slowTest
+    def test_SQuAD1(self) -> None:
+        self._test_helper(SQuAD1)
+
+    @slowTest
+    def test_SQuAD2(self) -> None:
+        self._test_helper(SQuAD2)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/torchdata/datapipes/iter/util/tararchivereader.py b/torchdata/datapipes/iter/util/tararchivereader.py
@@ -40,7 +40,6 @@ def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]:
         for data in self.datapipe:
             validate_pathname_binary_tuple(data)
             pathname, data_stream = data
-            folder_name = os.path.dirname(pathname)
             try:
                 # typing.cast is used here to silence mypy's type checker
                 tar = tarfile.open(fileobj=cast(Optional[IO[bytes]], data_stream), mode=self.mode)
@@ -51,7 +50,7 @@ def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]:
                     if extracted_fobj is None:
                         warnings.warn("failed to extract file {} from source tarfile {}".format(tarinfo.name, pathname))
                         raise tarfile.ExtractError
-                    inner_pathname = os.path.normpath(os.path.join(folder_name, tarinfo.name))
+                    inner_pathname = os.path.normpath(os.path.join(pathname, tarinfo.name))
                     yield inner_pathname, StreamWrapper(extracted_fobj)  # type: ignore[misc]
             except Exception as e:
                 warnings.warn(
diff --git a/torchdata/datapipes/iter/util/ziparchivereader.py b/torchdata/datapipes/iter/util/ziparchivereader.py
@@ -39,7 +39,6 @@ def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]:
         for data in self.datapipe:
             validate_pathname_binary_tuple(data)
             pathname, data_stream = data
-            folder_name = os.path.dirname(pathname)
             try:
                 # typing.cast is used here to silence mypy's type checker
                 zips = zipfile.ZipFile(cast(IO[bytes], data_stream))
@@ -51,7 +50,7 @@ def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]:
                     elif zipinfo.filename.endswith("/"):
                         continue
                     extracted_fobj = zips.open(zipinfo)
-                    inner_pathname = os.path.normpath(os.path.join(folder_name, zipinfo.filename))
+                    inner_pathname = os.path.normpath(os.path.join(pathname, zipinfo.filename))
                     yield inner_pathname, StreamWrapper(extracted_fobj)  # type: ignore[misc]
             except Exception as e:
                 warnings.warn(f"Unable to extract files from corrupted zipfile stream {pathname} due to: {e}, abort!")

Original file line number	Diff line number	Diff line change
`@@ -24,8 +24,8 @@`
`24`	`24`	`_PATH = "amazon_review_polarity_csv.tar.gz"`
`25`	`25`
`26`	`26`	`_EXTRACTED_FILES = {`
`27`		`- "train": f"{os.sep}".join(["amazon_review_polarity_csv", "train.csv"]),`
`28`		`- "test": f"{os.sep}".join(["amazon_review_polarity_csv", "test.csv"]),`
	`27`	`+ "train": f"{os.sep}".join([_PATH, "amazon_review_polarity_csv", "train.csv"]),`
	`28`	`+ "test": f"{os.sep}".join([_PATH, "amazon_review_polarity_csv", "test.csv"]),`
`29`	`29`	`}`
`30`	`30`
`31`	`31`	`_EXTRACTED_FILES_MD5 = {`