|
2 | 2 | import codecs
|
3 | 3 | from typing import NamedTuple, Dict
|
4 | 4 | import ir_datasets
|
5 |
| -from ir_datasets.util import TarExtractAll, RelativePath, GzipExtract |
| 5 | +from ir_datasets.util import TarExtractAll, RelativePath, GzipExtract, Migrator |
6 | 6 | from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries
|
7 | 7 | from ir_datasets.formats import TsvQueries, BaseDocs, TrecQrels, GenericDoc
|
8 | 8 | from ir_datasets.indices import PickleLz4FullStore
|
@@ -36,7 +36,7 @@ def docs_cls(self):
|
36 | 36 |
|
37 | 37 | def docs_store(self, field='doc_id'):
|
38 | 38 | return PickleLz4FullStore(
|
39 |
| - path=f'{ir_datasets.util.home_path()/NAME/self._lang}/collection.pklz4', |
| 39 | + path=f'{ir_datasets.util.home_path()/NAME/self._lang}.pklz4', |
40 | 40 | init_iter_fn=self.docs_iter,
|
41 | 41 | data_cls=self.docs_cls(),
|
42 | 42 | lookup_field=field,
|
@@ -77,9 +77,14 @@ def _init():
|
77 | 77 | 'th': ('mrtydi-v1.0-thai', 568855),
|
78 | 78 | }
|
79 | 79 |
|
| 80 | + migrator = Migrator(base_path/'irds_version.txt', 'v2', |
| 81 | + affected_files=[base_path/lang for lang in langs], |
| 82 | + message='Migrating mr-tydi (restructuring directory)') |
| 83 | + |
80 | 84 | for lang, (file_name, count_hint) in langs.items():
|
81 |
| - dlc_ds = TarExtractAll(dlc[lang], base_path/lang) |
| 85 | + dlc_ds = TarExtractAll(dlc[lang], f'{base_path/lang}.data') |
82 | 86 | docs = MrTydiDocs(GzipExtract(RelativePath(dlc_ds, f'{file_name}/collection/docs.jsonl.gz')), lang, count_hint=count_hint)
|
| 87 | + docs = migrator(docs) |
83 | 88 | subsets[lang] = Dataset(
|
84 | 89 | docs,
|
85 | 90 | TsvQueries(RelativePath(dlc_ds, f'{file_name}/topic.tsv'), lang=lang),
|
|
0 commit comments