Skip to content

Commit 54dfd10

Browse files
committed
Merge remote-tracking branch 'upstream/master'
2 parents e9c656c + 65a904b commit 54dfd10

15 files changed: +2094 additions, −57 deletions

examples/adding_datasets.ipynb

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"\n",
2020
"There are four files involved in adding a dataset to the `ir_datasets` package:\n",
2121
" - `ir_datasets/datasets/[dataset-id].py` - Contains the definition of the dataset and any specialized code for handling it.\n",
22-
" - `ir_datasets/etc/downloads.yaml` - Contains information about how to download and verify dataset source files.\n",
22+
" - `ir_datasets/etc/downloads.json` - Contains information about how to download and verify dataset source files.\n",
2323
" - `ir_datasets/docs/[dataset-id].yaml` - Contains documentation of the dataset.\n",
2424
" - `test/integration/[dataset-id].py` - Contains automated tests to ensure the dataset is processed as expected.\n",
2525
" \n",
@@ -93,27 +93,28 @@
9393
"cell_type": "markdown",
9494
"metadata": {},
9595
"source": [
96-
"File: `ir_datasets/etc/downloads.yaml`\n",
96+
"File: `ir_datasets/etc/downloads.json`\n",
9797
"\n",
9898
"(add lines like these to the file)\n",
9999
"\n",
100-
"```yaml\n",
101-
"dummy: # should match the NAME above\n",
102-
" docs: # the key to get this download reference from dlc[] above\n",
103-
" # URL to download\n",
104-
" url: 'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/docs.tsv'\n",
105-
" # to verify the integrity of the download (hint: use md5sum)\n",
106-
" expected_md5: 'c7bb5a1a3a07d51de50e8414245c2be4'\n",
107-
" # where to store the file (under base_path)\n",
108-
" cache_path: 'docs.tsv'\n",
109-
" queries:\n",
110-
" url: 'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/queries.tsv'\n",
111-
" expected_md5: '08ba86d990cbe6890f727946346964db'\n",
112-
" cache_path: 'queries.tsv'\n",
113-
" qrels:\n",
114-
" url: 'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/qrels'\n",
115-
" expected_md5: '79ed359fe0afa0f67eb39f468d162920'\n",
116-
" cache_path: 'qrels'\n",
100+
"```json\n",
101+
"\"dummy\": {\n",
102+
" \"docs\": {\n",
103+
" \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/docs.tsv\",\n",
104+
" \"expected_md5\": \"c7bb5a1a3a07d51de50e8414245c2be4\",\n",
105+
" \"cache_path\": \"docs.tsv\"\n",
106+
" },\n",
107+
" \"queries\": {\n",
108+
" \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/queries.tsv\",\n",
109+
" \"expected_md5\": \"08ba86d990cbe6890f727946346964db\",\n",
110+
" \"cache_path\": \"queries.tsv\"\n",
111+
" },\n",
112+
" \"qrels\": {\n",
113+
" \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/qrels\",\n",
114+
" \"expected_md5\": \"79ed359fe0afa0f67eb39f468d162920\",\n",
115+
" \"cache_path\": \"qrels\"\n",
116+
" }\n",
117+
"}\n",
117118
"```"
118119
]
119120
},

ir_datasets/commands/build_download_cache.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import io
44
import os
55
import argparse
6-
import yaml
6+
import json
77
from contextlib import contextmanager
88
import ir_datasets
99

@@ -63,8 +63,8 @@ def main(args):
6363
parser.add_argument('--retries', default='10')
6464
args = parser.parse_args(args)
6565

66-
with open('ir_datasets/etc/downloads.yaml') as f:
67-
data = yaml.load(f, Loader=yaml.BaseLoader)
66+
with open('ir_datasets/etc/downloads.json') as f:
67+
data = json.load(f)
6868
with tmp_environ(IR_DATASETS_DL_TRIES=args.retries):
6969
_build_cache(data, args.dir)
7070

ir_datasets/datasets/dpr_w100.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from typing import NamedTuple, Tuple
2-
import ijson
32
import contextlib
43
import itertools
54
import ir_datasets
@@ -41,6 +40,7 @@ def __init__(self, dlc, base_path, passage_id_key='passage_id'):
4140
self._passage_id_key = passage_id_key
4241

4342
def build(self):
43+
ijson = ir_datasets.lazy_libs.ijson()
4444
if (self._base_path/'queries.tsv').exists():
4545
return # already built
4646

ir_datasets/datasets/highwire.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import codecs
22
from typing import NamedTuple, Tuple
33
from zipfile import ZipFile
4-
import lxml.html
54
import ir_datasets
65
from ir_datasets.util import DownloadConfig
76
from ir_datasets.formats import BaseDocs, BaseQueries, GenericQuery, BaseQrels
@@ -64,6 +63,7 @@ def docs_iter(self):
6463
return iter(self.docs_store())
6564

6665
def _docs_iter(self):
66+
lxml_html = ir_datasets.lazy_libs.lxml_html()
6767
def _legalspans_iter():
6868
with self._legalspans_dlc.stream() as f:
6969
prev_did, spans = None, None
@@ -86,11 +86,11 @@ def _legalspans_iter():
8686
assert legalspans_did == doc_id
8787
spans = tuple(HighwireSpan(s, l, doc_raw[s:s+l]) for s, l in legalspans)
8888
# the title should be in the first span inside a <h2> element
89-
title = lxml.html.document_fromstring(b'<OUTER>' + spans[0].text + b'</OUTER>')
89+
title = lxml_html.document_fromstring(b'<OUTER>' + spans[0].text + b'</OUTER>')
9090
title = title.xpath("//h2")
9191
title = title[0].text_content() if title else ''
9292
# keep just the text content within each spans
93-
spans = tuple(HighwireSpan(s, l, lxml.html.document_fromstring(b'<OUTER>' + t + b'</OUTER>').text_content()) for s, l, t in spans)
93+
spans = tuple(HighwireSpan(s, l, lxml_html.document_fromstring(b'<OUTER>' + t + b'</OUTER>').text_content()) for s, l, t in spans)
9494
yield HighwireDoc(doc_id, source, title, spans)
9595

9696
def docs_path(self):

ir_datasets/datasets/medline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import itertools
77
from typing import NamedTuple, Tuple
88
import tarfile
9-
import xml.etree.ElementTree as ET
109
import ir_datasets
1110
from ir_datasets.util import DownloadConfig, GzipExtract, ZipExtract
1211
from ir_datasets.formats import BaseDocs, BaseQueries, GenericQuery, TrecQrels, TrecXmlQueries
@@ -85,6 +84,7 @@ def __init__(self, name, dlcs):
8584

8685
@ir_datasets.util.use_docstore
8786
def docs_iter(self):
87+
ET = ir_datasets.lazy_libs.xml_etree()
8888
with ExitStack() as stack:
8989
if self._name == '2004':
9090
# The files for 2004 are a large XML file that's split internally.

ir_datasets/datasets/msmarco_passage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ def train_med():
217217
)
218218

219219
# DL-Hard
220-
dl_hard_qrels_migrator = Migrator(base_path/'trec-dl-hard'/'irds_version.txt', 'v2',
220+
dl_hard_qrels_migrator = Migrator(base_path/'trec-dl-hard'/'irds_version.txt', 'v3',
221221
affected_files=[base_path/'trec-dl-hard'/'qrels'],
222222
message='Updating trec-dl-hard qrels')
223223
hard_qids = Lazy(lambda: DL_HARD_QIDS)

ir_datasets/datasets/msmarco_qna.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import codecs
77
from typing import NamedTuple, Tuple
88
import re
9-
import ijson
109
import ir_datasets
1110
from ir_datasets.util import Cache, TarExtract, IterStream, GzipExtract, Lazy, DownloadConfig, Migrator
1211
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredScoredDocs, FilteredQrels, FilteredDocPairs, YamlDocumentation
@@ -96,6 +95,7 @@ def _internal_docs_store(self):
9695
return self._docs_store
9796

9897
def build(self):
98+
ijson = ir_datasets.lazy_libs.ijson()
9999
docs_store = self._internal_docs_store()
100100
if docs_store.built():
101101
return # already built

0 commit comments

Comments (0)