Skip to content

Commit 6f04fc1

Browse files
author
Shuo Sun
committed
Added CLIRMatrix datasets
1 parent e992363 commit 6f04fc1

File tree

8 files changed

+145
-9
lines changed

8 files changed

+145
-9
lines changed

ir_datasets/datasets/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,4 @@
2828
from . import vaswani
2929
from . import wapo
3030
from . import wikir
31+
from . import clirmatrix

ir_datasets/datasets/clirmatrix.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import contextlib
2+
from pathlib import Path
3+
from typing import NamedTuple
4+
import ir_datasets
5+
from ir_datasets.util import GzipExtract, DownloadConfig_CM
6+
from ir_datasets.datasets.base import Dataset, YamlDocumentation
7+
from ir_datasets.formats import TsvDocs, CLIRMatrixQueries, CLIRMatrixQrels
8+
from tqdm import tqdm
9+
10+
NAME = 'clirmatrix'
11+
12+
_logger = ir_datasets.log.easy()
13+
14+
# Relevance-grade definitions: CLIRMatrix uses graded relevance 0-6, and each
# grade's description is simply its string form.
QRELS_DEFS = {grade: str(grade) for grade in range(6, -1, -1)}
23+
24+
def _init():
    """Build and register the CLIRMatrix collection and its subsets.

    Returns a (base, subsets) pair: the top-level documentation-only dataset
    and a dict mapping "{dataset}/{query_lang}/{doc_lang}/{split}" to Dataset.
    """
    base_path = ir_datasets.util.home_path() / NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    # One shared docs collection per document language, keyed by language code
    # (the last path component of each download key).
    docs = {}
    doc_dlc = DownloadConfig_CM.context("clirmatrix_docs", base_path)
    for key in doc_dlc.contents():
        lang = key.split("/")[-1]
        docs[lang] = TsvDocs(GzipExtract(doc_dlc[key]), namespace=lang, lang=lang)

    subsets = {}
    for dataset in ["clirmatrix_multi8", "clirmatrix_bi139_base", "clirmatrix_bi139_full"]:
        dataset_name = dataset.split("_", 1)[-1]  # drop the "clirmatrix_" prefix
        dlc = DownloadConfig_CM.context(dataset, base_path)
        # tqdm: there are many language pairs per dataset, so show progress.
        for key in tqdm(dlc.contents()):
            _, lcodes, split = key.split("/")
            query_lang, doc_lang = lcodes.split("_")
            # Queries and qrels are both derived from the same gzipped file.
            qrel_dlc = GzipExtract(dlc[key])
            subsets[f"{dataset_name}/{query_lang}/{doc_lang}/{split}"] = Dataset(
                docs[doc_lang],
                CLIRMatrixQrels(qrel_dlc, QRELS_DEFS),
                CLIRMatrixQueries(qrel_dlc, query_lang))

    base = Dataset(documentation('_'))
    ir_datasets.registry.register(NAME, base)
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_name}', subsets[subset_name])

    return base, subsets


collection, subsets = _init()

ir_datasets/etc/clirmatrix_downloads.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

ir_datasets/formats/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
from .trec import TrecDocs, TrecQueries, TrecXmlQueries, TrecColonQueries, TrecQrels, TrecPrels, TrecScoredDocs, TrecDoc, TitleUrlTextDoc, TrecQuery, TrecSubtopic, TrecQrel, TrecPrel
66
from .webarc import WarcDocs, WarcDoc
77
from .ntcir import NtcirQrels
8+
from .clirmatrix import CLIRMatrixQueries, CLIRMatrixQrels

ir_datasets/formats/clirmatrix.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import codecs
2+
import json
3+
from . import TrecQrels, TrecQrel
4+
from .base import GenericQuery, BaseQueries
5+
6+
7+
class CLIRMatrixQueries(BaseQueries):
    """Queries read from a CLIRMatrix JSON-lines file.

    Each non-blank line is a JSON object with (at least) "src_id" and
    "src_query" fields; blank lines are skipped.
    """

    def __init__(self, streamer, query_lang):
        super().__init__()
        self._streamer = streamer  # object exposing .stream() -> binary stream
        self.query_lang = query_lang  # ISO language code of the queries

    def queries_iter(self):
        """Yield GenericQuery(qid, text) for each topic line in the stream."""
        with self._streamer.stream() as stream:
            f = codecs.getreader('utf-8')(stream)
            for line in f:
                if line == '\n':
                    continue  # ignore blank lines
                j = json.loads(line)
                qid = j["src_id"]
                query = j["src_query"]
                yield GenericQuery(qid, query)

    def queries_namespace(self):
        # BUG FIX: the original returned the bare name `NAME`, which is not
        # defined in this module (it is defined in
        # ir_datasets/datasets/clirmatrix.py) and would raise NameError here.
        # Return the namespace literal instead.
        return 'clirmatrix'

    def queries_cls(self):
        return GenericQuery

    def queries_lang(self):
        return self.query_lang
34+
35+
class CLIRMatrixQrels(TrecQrels):
    """Qrels read from a CLIRMatrix JSON-lines file.

    Each non-blank line holds a "src_id" (query id) and a "tgt_results" list
    of [doc_id, relevance] pairs; one TrecQrel is yielded per pair.
    """

    def qrels_iter(self):
        with self._qrels_dlc.stream() as stream:
            reader = codecs.getreader('utf8')(stream)
            for raw_line in reader:
                if raw_line == '\n':
                    continue  # ignore blank lines
                record = json.loads(raw_line)
                query_id = record["src_id"]
                for doc_id, relevance in record["tgt_results"]:
                    yield TrecQrel(query_id, doc_id, int(relevance), '0')

ir_datasets/lazy_libs.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,13 @@ def yaml():
3737
return _cache['yaml']
3838

3939

40+
def json():
    # Lazily import and memoize the stdlib json module in the shared _cache,
    # following this module's convention for optional/deferred imports.
    try:
        return _cache['json']
    except KeyError:
        import json as _json
        _cache['json'] = _json
        return _cache['json']
45+
46+
4047
def trec_car():
4148
if 'trec_car' not in _cache:
4249
import trec_car.read_data

ir_datasets/util/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pathlib import Path
88
from .. import log
99
from .fileio import IterStream, Cache, TarExtract, TarExtractAll, RelativePath, GzipExtract, ZipExtract, ZipExtractCache, StringFile, ReTar, Bz2Extract
10-
from .download import Download, DownloadConfig, BaseDownload, RequestsDownload, LocalDownload
10+
from .download import Download, DownloadConfig, BaseDownload, RequestsDownload, LocalDownload, DownloadConfig_CM
1111
from .hash import HashVerificationError, HashVerifier, HashStream
1212
from .registry import Registry
1313

ir_datasets/util/download.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,10 @@ def __repr__(self):
119119

120120

121121
class LocalDownload(BaseDownload):
122-
def __init__(self, path, message=None, mkdir=True):
    """Wrap a local file path; when mkdir is True, pre-create its parent dir."""
    self._path = Path(path)
    self._message = message
    if mkdir:
        self._path.parent.mkdir(parents=True, exist_ok=True)
126127

127128
def path(self):
@@ -220,23 +221,41 @@ def dua_ctxt(cls, dua):
220221

221222

222223
class _DownloadConfig:
223-
def __init__(self, file=None, base_path=None, contents=None, dua=None, parser="yaml"):
    """Download configuration backed by a file packaged with ir_datasets.

    file: package-relative path of the config file (read lazily).
    contents: pre-parsed config dict (used by child configs from context()).
    parser: "yaml" (default) or "json" -- format of *file*.
    """
    self._file = file
    self._base_path = base_path
    self._contents = contents
    self._dua = dua
    self._parser = parser
    # Lazily-resolved, memoized paths (see get_home_path / get_download_path).
    self.home_path = None
    self.download_path = None
228232

229233
def contents(self):
    """Parse (once) and return the download configuration mapping."""
    if self._contents is None:
        raw = pkgutil.get_data('ir_datasets', self._file)
        if self._parser == "json":
            self._contents = ir_datasets.lazy_libs.json().loads(raw)
        else:
            yaml = ir_datasets.lazy_libs.yaml()
            self._contents = yaml.load(raw, Loader=yaml.BaseLoader)
    return self._contents
235243

236244
def context(self, key, base_path=None, dua=None):
    """Return a child config scoped to *key* (or the whole contents if falsy)."""
    contents = self.contents()
    scoped = contents[key] if key else contents
    return _DownloadConfig(
        contents=scoped,
        base_path=base_path or self._base_path,
        dua=dua or self._dua)
239247

248+
def get_home_path(self):
    """Return (and memoize) the ir_datasets home directory."""
    if self.home_path is not None:
        return self.home_path
    self.home_path = util.home_path()
    return self.home_path
252+
253+
def get_download_path(self):
    """Return (and memoize) the shared downloads directory, creating it on first use.

    BUG FIX: the original called mkdir() on self.download_path.parent -- i.e.
    the home path -- so the 'downloads' directory itself was never created.
    Since LocalDownload is now constructed with mkdir=False, nothing else
    creates it, and symlinking a local copy would fail. Create the directory
    itself instead.
    """
    if self.download_path is None:
        self.download_path = Path(self.get_home_path()) / 'downloads'
        self.download_path.mkdir(parents=True, exist_ok=True)
    return self.download_path
258+
240259
def __getitem__(self, key):
241260
dlc = self.contents()[key]
242261
sources = []
@@ -248,11 +267,10 @@ def __getitem__(self, key):
248267
cache_path = dlc['cache_path']
249268
if 'url' in dlc:
250269
if not dlc.get('skip_local') and dlc.get('expected_md5'):
251-
local_path = Path(util.home_path()) / 'downloads' / dlc['expected_md5']
252-
local_path.parent.mkdir(parents=True, exist_ok=True)
270+
local_path = Path(self.get_download_path()) / dlc['expected_md5']
253271
local_msg = (f'If you have a local copy of {dlc["url"]}, you can symlink it here '
254272
f'to avoid downloading it again: {local_path}')
255-
sources.append(LocalDownload(local_path, local_msg))
273+
sources.append(LocalDownload(local_path, local_msg, mkdir=False))
256274
sources.append(RequestsDownload(dlc['url']))
257275
elif 'instructions' in dlc:
258276
if 'cache_path' in dlc:
@@ -266,3 +284,4 @@ def __getitem__(self, key):
266284

267285

268286
# Module-level singleton download configs: the main yaml-based registry, and a
# separate json-based one for CLIRMatrix (presumably kept separate because its
# contents file is very large -- verify against etc/clirmatrix_downloads.json).
DownloadConfig = _DownloadConfig(file='etc/downloads.yaml')
DownloadConfig_CM = _DownloadConfig(file='etc/clirmatrix_downloads.json', parser="json")

0 commit comments

Comments
 (0)