Skip to content

Commit 8f1d800

Browse files
committed
new version of dl-hard qrels for msmarco-document
1 parent 8c5ae60 commit 8f1d800

File tree

3 files changed

+14
-11
lines changed

3 files changed

+14
-11
lines changed

ir_datasets/datasets/msmarco_document.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
from typing import NamedTuple
22
import ir_datasets
3-
from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy
3+
from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator
44
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels
55
from ir_datasets.formats import TrecDocs, TsvQueries, TrecQrels, TrecScoredDocs
66
from ir_datasets.datasets.msmarco_passage import DUA, QRELS_DEFS, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS
@@ -114,14 +114,17 @@ def _init():
114114
)
115115

116116
# DL-Hard
117+
dl_hard_qrels_migrator = Migrator(base_path/'trec-dl-hard'/'irds_version.txt', 'v2',
118+
affected_files=[base_path/'trec-dl-hard'/'qrels'],
119+
message='Updating trec-dl-hard qrels')
117120
hard_qids = Lazy(lambda: DL_HARD_QIDS)
118121
dl_hard_base_queries = TsvQueries([
119122
Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path/'trec-dl-2019/queries.tsv'),
120123
Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path/'trec-dl-2020/queries.tsv')], namespace='msmarco', lang='en')
121124
subsets['trec-dl-hard'] = Dataset(
122125
collection,
123126
FilteredQueries(dl_hard_base_queries, hard_qids),
124-
TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS),
127+
dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
125128
documentation('trec-dl-hard')
126129
)
127130
hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])

ir_datasets/etc/downloads.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1028,7 +1028,7 @@ msmarco-document:
10281028
cache_path: 'orcas/ms.run.gz'
10291029
trec-dl-hard/qrels:
10301030
url: 'https://raw.githubusercontent.com/grill-lab/DL-Hard/main/dataset/dl_hard-doc.qrels'
1031-
expected_md5: 'da2366aef13c11b352fe4587d96ee9f8'
1031+
expected_md5: '06dfe71d497e081a7c4c1294979edb7d'
10321032
cache_path: 'trec-dl-hard/qrels'
10331033

10341034

test/integration/msmarco_document.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -59,10 +59,10 @@ def test_msmarco_document_queries(self):
5959
9: GenericQuery('1105792', 'define: geon'),
6060
44: GenericQuery('997622', 'where is the show shameless filmed'),
6161
})
62-
self._test_queries('msmarco-document/trec-dl-hard', count=49, items={
62+
self._test_queries('msmarco-document/trec-dl-hard', count=50, items={
6363
0: GenericQuery('1108939', 'what slows down the flow of blood'),
6464
9: GenericQuery('451602', "medicare's definition of mechanical ventilation"),
65-
48: GenericQuery('88495', 'causes of stroke?'),
65+
49: GenericQuery('88495', 'causes of stroke?'),
6666
})
6767
self._test_queries('msmarco-document/trec-dl-hard/fold1', count=10, items={
6868
0: GenericQuery('966413', 'where are the benefits of cinnamon as a supplement?'),
@@ -80,9 +80,9 @@ def test_msmarco_document_queries(self):
8080
0: GenericQuery('1108100', 'what type of movement do bacteria exhibit?'),
8181
9: GenericQuery('88495', 'causes of stroke?'),
8282
})
83-
self._test_queries('msmarco-document/trec-dl-hard/fold5', count=9, items={
83+
self._test_queries('msmarco-document/trec-dl-hard/fold5', count=10, items={
8484
0: GenericQuery('190044', 'foods to detox liver naturally'),
85-
8: GenericQuery('877809', 'what metal are hip replacements made of'),
85+
9: GenericQuery('877809', 'what metal are hip replacements made of'),
8686
})
8787

8888

@@ -122,10 +122,10 @@ def test_msmarco_document_qrels(self):
122122
9: TrecQrel('42255', 'D1168483', 0, '0'),
123123
9097: TrecQrel('1136962', 'D96742', 0, '0'),
124124
})
125-
self._test_qrels('msmarco-document/trec-dl-hard', count=8540, items={
125+
self._test_qrels('msmarco-document/trec-dl-hard', count=8544, items={
126126
0: TrecQrel('1117817', 'D192188', 0, 'Q0'),
127127
9: TrecQrel('801118', 'D579461', 1, 'Q0'),
128-
8539: TrecQrel('315637', 'D655701', 3, 'Q0'),
128+
8543: TrecQrel('273695', 'D736968', 2, 'Q0'),
129129
})
130130
self._test_qrels('msmarco-document/trec-dl-hard/fold1', count=1557, items={
131131
0: TrecQrel('1056204', 'D1891649', 1, 'Q0'),
@@ -147,10 +147,10 @@ def test_msmarco_document_qrels(self):
147147
9: TrecQrel('87452', 'D1000458', 2, 'Q0'),
148148
1053: TrecQrel('1108100', 'D3318246', 3, 'Q0'),
149149
})
150-
self._test_qrels('msmarco-document/trec-dl-hard/fold5', count=4110, items={
150+
self._test_qrels('msmarco-document/trec-dl-hard/fold5', count=4114, items={
151151
0: TrecQrel('489204', 'D1002646', 0, 'Q0'),
152152
9: TrecQrel('489204', 'D1025842', 1, 'Q0'),
153-
4109: TrecQrel('190044', 'D910793', 3, 'Q0'),
153+
4113: TrecQrel('273695', 'D736968', 2, 'Q0'),
154154
})
155155

156156
def test_msmarco_document_scoreddocs(self):

0 commit comments

Comments
 (0)