Skip to content

Commit a1797bb

Browse files
Merge pull request allenai#139 from allenai/robust04-fix
.z compression support for robust04
2 parents 7f9f8e9 + a06d99b commit a1797bb

File tree

4 files changed

+16
-3
lines changed

4 files changed

+16
-3
lines changed

.github/workflows/push.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
14 14
strategy:
15 15
fail-fast: false
16 16
matrix:
17-
python-version: [3.6, 3.7, 3.8, 3.9]
17+
python-version: [3.7, 3.8, 3.9]
18 18
os: ['ubuntu-latest', 'windows-latest', 'macOs-latest']
19 19
architecture: ['x64']
20 20

ir_datasets/formats/trec.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,9 +113,15 @@ def docs_iter(self):
113 113

114 114
def _docs_iter(self, path):
115 115
if Path(path).is_file():
116-
if str(path).endswith('.gz'):
116+
path_suffix = Path(path).suffix.lower()
117+
if path_suffix == '.gz':
117 118
with gzip.open(path, 'rb') as f:
118 119
yield from self._parser(f)
120+
elif path_suffix in ['.z', '.0z', '.1z', '.2z']:
121+
# unix "compress" command encoding
122+
unlzw3 = ir_datasets.lazy_libs.unlzw3()
123+
with io.BytesIO(unlzw3.unlzw(path)) as f:
124+
yield from self._parser(f)
119 125
else:
120 126
with path.open('rb') as f:
121 127
yield from self._parser(f)

ir_datasets/lazy_libs.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,4 +102,10 @@ def pyautocorpus():
102 102
if 'pyautocorpus' not in _cache:
103 103
import pyautocorpus
104 104
_cache['pyautocorpus'] = pyautocorpus
105-
return _cache['pyautocorpus']
105+
return _cache['pyautocorpus']
106+
107+
def unlzw3():
108+
if 'unlzw3' not in _cache:
109+
import unlzw3
110+
_cache['unlzw3'] = unlzw3
111+
return _cache['unlzw3']

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ warc3-wet-clueweb09>=0.2.5
11 11
zlib-state>=0.1.3
12 12
ijson>=3.1.3
13 13
pyautocorpus>=0.1.1
14+
unlzw3>=0.2.1

0 commit comments

Comments
 (0)