Skip to content

Commit a117903

Browse files
Merge pull request allenai#122 from allenai/mr-tydi
fixed bug with mr-tydi
2 parents 2347b94 + 8c8193f commit a117903

File tree

1 file changed

+8
-3
lines changed

1 file changed

+8
-3
lines changed

ir_datasets/datasets/mr_tydi.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
import codecs
33
from typing import NamedTuple, Dict
44
import ir_datasets
5-
from ir_datasets.util import TarExtractAll, RelativePath, GzipExtract
5+
from ir_datasets.util import TarExtractAll, RelativePath, GzipExtract, Migrator
66
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries
77
from ir_datasets.formats import TsvQueries, BaseDocs, TrecQrels, GenericDoc
88
from ir_datasets.indices import PickleLz4FullStore
@@ -36,7 +36,7 @@ def docs_cls(self):
3636

3737
def docs_store(self, field='doc_id'):
3838
return PickleLz4FullStore(
39-
path=f'{ir_datasets.util.home_path()/NAME/self._lang}/collection.pklz4',
39+
path=f'{ir_datasets.util.home_path()/NAME/self._lang}.pklz4',
4040
init_iter_fn=self.docs_iter,
4141
data_cls=self.docs_cls(),
4242
lookup_field=field,
@@ -77,9 +77,14 @@ def _init():
7777
'th': ('mrtydi-v1.0-thai', 568855),
7878
}
7979

80+
migrator = Migrator(base_path/'irds_version.txt', 'v2',
81+
affected_files=[base_path/lang for lang in langs],
82+
message='Migrating mr-tydi (restructuring directory)')
83+
8084
for lang, (file_name, count_hint) in langs.items():
81-
dlc_ds = TarExtractAll(dlc[lang], base_path/lang)
85+
dlc_ds = TarExtractAll(dlc[lang], f'{base_path/lang}.data')
8286
docs = MrTydiDocs(GzipExtract(RelativePath(dlc_ds, f'{file_name}/collection/docs.jsonl.gz')), lang, count_hint=count_hint)
87+
docs = migrator(docs)
8388
subsets[lang] = Dataset(
8489
docs,
8590
TsvQueries(RelativePath(dlc_ds, f'{file_name}/topic.tsv'), lang=lang),

0 commit comments

Comments
 (0)