Skip to content

Commit 6f04fc1

Browse files
author
Shuo Sun
committed
Added CLIRMatrix datasets
1 parent e992363 commit 6f04fc1

File tree

8 files changed

+145
-9
lines changed

8 files changed

+145
-9
lines changed

ir_datasets/datasets/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,4 @@
2828
from . import vaswani
2929
from . import wapo
3030
from . import wikir
31+
from . import clirmatrix

ir_datasets/datasets/clirmatrix.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import contextlib
2+
from pathlib import Path
3+
from typing import NamedTuple
4+
import ir_datasets
5+
from ir_datasets.util import GzipExtract, DownloadConfig_CM
6+
from ir_datasets.datasets.base import Dataset, YamlDocumentation
7+
from ir_datasets.formats import TsvDocs, CLIRMatrixQueries, CLIRMatrixQrels
8+
from tqdm import tqdm
9+
10+
NAME = 'clirmatrix'
11+
12+
_logger = ir_datasets.log.easy()
13+
14+
# Relevance-grade definitions: CLIRMatrix uses graded relevance 0-6, and each
# grade's description is simply its string form.
QRELS_DEFS = {grade: str(grade) for grade in range(6, -1, -1)}
23+
24+
def _init():
    """Build and register the CLIRMatrix collection and its subsets.

    Returns a (base, subsets) pair: the top-level documentation-only dataset
    and a dict mapping "{dataset}/{query_lang}/{doc_lang}/{split}" to Dataset.
    """
    base_path = ir_datasets.util.home_path() / NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    # One shared docs collection per document language, keyed by language code
    # (the last path component of each download key).
    docs = {}
    doc_dlc = DownloadConfig_CM.context("clirmatrix_docs", base_path)
    for key in doc_dlc.contents():
        lang = key.split("/")[-1]
        docs[lang] = TsvDocs(GzipExtract(doc_dlc[key]), namespace=lang, lang=lang)

    subsets = {}
    for dataset in ["clirmatrix_multi8", "clirmatrix_bi139_base", "clirmatrix_bi139_full"]:
        dataset_name = dataset.split("_", 1)[-1]  # drop the "clirmatrix_" prefix
        dlc = DownloadConfig_CM.context(dataset, base_path)
        # tqdm: there are many language pairs per dataset, so show progress.
        for key in tqdm(dlc.contents()):
            _, lcodes, split = key.split("/")
            query_lang, doc_lang = lcodes.split("_")
            # Queries and qrels are both derived from the same gzipped file.
            qrel_dlc = GzipExtract(dlc[key])
            subsets[f"{dataset_name}/{query_lang}/{doc_lang}/{split}"] = Dataset(
                docs[doc_lang],
                CLIRMatrixQrels(qrel_dlc, QRELS_DEFS),
                CLIRMatrixQueries(qrel_dlc, query_lang))

    base = Dataset(documentation('_'))
    ir_datasets.registry.register(NAME, base)
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_name}', subsets[subset_name])

    return base, subsets


collection, subsets = _init()

ir_datasets/etc/clirmatrix_downloads.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

ir_datasets/formats/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
from .trec import TrecDocs, TrecQueries, TrecXmlQueries, TrecColonQueries, TrecQrels, TrecPrels, TrecScoredDocs, TrecDoc, TitleUrlTextDoc, TrecQuery, TrecSubtopic, TrecQrel, TrecPrel
66
from .webarc import WarcDocs, WarcDoc
77
from .ntcir import NtcirQrels
8+
from .clirmatrix import CLIRMatrixQueries, CLIRMatrixQrels

ir_datasets/formats/clirmatrix.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import codecs
2+
import json
3+
from . import TrecQrels, TrecQrel
4+
from .base import GenericQuery, BaseQueries
5+
6+
7+
class CLIRMatrixQueries(BaseQueries):
    """Queries read from a CLIRMatrix JSON-lines file.

    Each non-blank line is a JSON object with (at least) "src_id" and
    "src_query" fields; blank lines are skipped.
    """

    def __init__(self, streamer, query_lang):
        super().__init__()
        self._streamer = streamer  # object exposing .stream() -> binary stream
        self.query_lang = query_lang  # ISO language code of the queries

    def queries_iter(self):
        """Yield GenericQuery(qid, text) for each topic line in the stream."""
        with self._streamer.stream() as stream:
            f = codecs.getreader('utf-8')(stream)
            for line in f:
                if line == '\n':
                    continue  # ignore blank lines
                j = json.loads(line)
                qid = j["src_id"]
                query = j["src_query"]
                yield GenericQuery(qid, query)

    def queries_namespace(self):
        # BUG FIX: the original returned the bare name `NAME`, which is not
        # defined in this module (it is defined in
        # ir_datasets/datasets/clirmatrix.py) and would raise NameError here.
        # Return the namespace literal instead.
        return 'clirmatrix'

    def queries_cls(self):
        return GenericQuery

    def queries_lang(self):
        return self.query_lang
34+
35+
class CLIRMatrixQrels(TrecQrels):
    """Qrels read from a CLIRMatrix JSON-lines file.

    Each non-blank line holds a "src_id" (query id) and a "tgt_results" list
    of [doc_id, relevance] pairs; one TrecQrel is yielded per pair.
    """

    def qrels_iter(self):
        with self._qrels_dlc.stream() as stream:
            reader = codecs.getreader('utf8')(stream)
            for raw_line in reader:
                if raw_line == '\n':
                    continue  # ignore blank lines
                record = json.loads(raw_line)
                query_id = record["src_id"]
                for doc_id, relevance in record["tgt_results"]:
                    yield TrecQrel(query_id, doc_id, int(relevance), '0')

ir_datasets/lazy_libs.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,13 @@ def yaml():
3737
return _cache['yaml']
3838

3939

40+
def json():
    # Lazily import and memoize the stdlib json module in the shared _cache,
    # following this module's convention for optional/deferred imports.
    try:
        return _cache['json']
    except KeyError:
        import json as _json
        _cache['json'] = _json
        return _cache['json']
45+
46+
4047
def trec_car():
4148
if 'trec_car' not in _cache:
4249
import trec_car.read_data

ir_datasets/util/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pathlib import Path
88
from .. import log
99
from .fileio import IterStream, Cache, TarExtract, TarExtractAll, RelativePath, GzipExtract, ZipExtract, ZipExtractCache, StringFile, ReTar, Bz2Extract
10-
from .download import Download, DownloadConfig, BaseDownload, RequestsDownload, LocalDownload
10+
from .download import Download, DownloadConfig, BaseDownload, RequestsDownload, LocalDownload, DownloadConfig_CM
1111
from .hash import HashVerificationError, HashVerifier, HashStream
1212
from .registry import Registry
1313

ir_datasets/util/download.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,10 @@ def __repr__(self):
119119

120120

121121
class LocalDownload(BaseDownload):
122-
def __init__(self, path, message=None, mkdir=True):
    """Wrap a local file path; when mkdir is True, pre-create its parent dir."""
    self._path = Path(path)
    self._message = message
    if mkdir:
        self._path.parent.mkdir(parents=True, exist_ok=True)
126127

127128
def path(self):
@@ -220,23 +221,41 @@ def dua_ctxt(cls, dua):
220221

221222

222223
class _DownloadConfig:
223-
def __init__(self, file=None, base_path=None, contents=None, dua=None, parser="yaml"):
    """Download configuration backed by a file packaged with ir_datasets.

    file: package-relative path of the config file (read lazily).
    contents: pre-parsed config dict (used by child configs from context()).
    parser: "yaml" (default) or "json" -- format of *file*.
    """
    self._file = file
    self._base_path = base_path
    self._contents = contents
    self._dua = dua
    self._parser = parser
    # Lazily-resolved, memoized paths (see get_home_path / get_download_path).
    self.home_path = None
    self.download_path = None
228232

229233
def contents(self):
    """Parse (once) and return the download configuration mapping."""
    if self._contents is None:
        raw = pkgutil.get_data('ir_datasets', self._file)
        if self._parser == "json":
            self._contents = ir_datasets.lazy_libs.json().loads(raw)
        else:
            yaml = ir_datasets.lazy_libs.yaml()
            self._contents = yaml.load(raw, Loader=yaml.BaseLoader)
    return self._contents
235243

236244
def context(self, key, base_path=None, dua=None):
    """Return a child config scoped to *key* (or the whole contents if falsy)."""
    contents = self.contents()
    scoped = contents[key] if key else contents
    return _DownloadConfig(
        contents=scoped,
        base_path=base_path or self._base_path,
        dua=dua or self._dua)
239247

248+
def get_home_path(self):
    """Return (and memoize) the ir_datasets home directory."""
    if self.home_path is not None:
        return self.home_path
    self.home_path = util.home_path()
    return self.home_path
252+
253+
def get_download_path(self):
    """Return (and memoize) the shared downloads directory, creating it on first use.

    BUG FIX: the original called mkdir() on self.download_path.parent -- i.e.
    the home path -- so the 'downloads' directory itself was never created.
    Since LocalDownload is now constructed with mkdir=False, nothing else
    creates it, and symlinking a local copy would fail. Create the directory
    itself instead.
    """
    if self.download_path is None:
        self.download_path = Path(self.get_home_path()) / 'downloads'
        self.download_path.mkdir(parents=True, exist_ok=True)
    return self.download_path
258+
240259
def __getitem__(self, key):
241260
dlc = self.contents()[key]
242261
sources = []
@@ -248,11 +267,10 @@ def __getitem__(self, key):
248267
cache_path = dlc['cache_path']
249268
if 'url' in dlc:
250269
if not dlc.get('skip_local') and dlc.get('expected_md5'):
251-
local_path = Path(util.home_path()) / 'downloads' / dlc['expected_md5']
252-
local_path.parent.mkdir(parents=True, exist_ok=True)
270+
local_path = Path(self.get_download_path()) / dlc['expected_md5']
253271
local_msg = (f'If you have a local copy of {dlc["url"]}, you can symlink it here '
254272
f'to avoid downloading it again: {local_path}')
255-
sources.append(LocalDownload(local_path, local_msg))
273+
sources.append(LocalDownload(local_path, local_msg, mkdir=False))
256274
sources.append(RequestsDownload(dlc['url']))
257275
elif 'instructions' in dlc:
258276
if 'cache_path' in dlc:
@@ -266,3 +284,4 @@ def __getitem__(self, key):
266284

267285

268286
# Module-level singleton download configs: the main yaml-based registry, and a
# separate json-based one for CLIRMatrix (presumably kept separate because its
# contents file is very large -- verify against etc/clirmatrix_downloads.json).
DownloadConfig = _DownloadConfig(file='etc/downloads.yaml')
DownloadConfig_CM = _DownloadConfig(file='etc/clirmatrix_downloads.json', parser="json")

0 commit comments

Comments
 (0)