Skip to content

Commit 54dfd10

Browse files
committed
Merge remote-tracking branch 'upstream/master'
2 parents e9c656c + 65a904b commit 54dfd10

15 files changed: +2094 additions, −57 deletions

examples/adding_datasets.ipynb

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"\n",
2020
"There are four files involved in adding a dataset to the `ir_datasets` package:\n",
2121
" - `ir_datasets/datasets/[dataset-id].py` - Contains the definition of the dataset and any specialized code for handling it.\n",
22-
" - `ir_datasets/etc/downloads.yaml` - Contains information about how to download and verify dataset source files.\n",
22+
" - `ir_datasets/etc/downloads.json` - Contains information about how to download and verify dataset source files.\n",
2323
" - `ir_datasets/docs/[dataset-id].yaml` - Contains documentation of the dataset.\n",
2424
" - `test/integration/[dataset-id].py` - Contains automated tests to ensure the dataset is processed as expected.\n",
2525
" \n",
@@ -93,27 +93,28 @@
9393
"cell_type": "markdown",
9494
"metadata": {},
9595
"source": [
96-
"File: `ir_datasets/etc/downloads.yaml`\n",
96+
"File: `ir_datasets/etc/downloads.json`\n",
9797
"\n",
9898
"(add lines like these to the file)\n",
9999
"\n",
100-
"```yaml\n",
101-
"dummy: # should match the NAME above\n",
102-
" docs: # the key to get this download reference from dlc[] above\n",
103-
" # URL to download\n",
104-
" url: 'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/docs.tsv'\n",
105-
" # to verify the integrity of the download (hint: use md5sum)\n",
106-
" expected_md5: 'c7bb5a1a3a07d51de50e8414245c2be4'\n",
107-
" # where to store the file (under base_path)\n",
108-
" cache_path: 'docs.tsv'\n",
109-
" queries:\n",
110-
" url: 'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/queries.tsv'\n",
111-
" expected_md5: '08ba86d990cbe6890f727946346964db'\n",
112-
" cache_path: 'queries.tsv'\n",
113-
" qrels:\n",
114-
" url: 'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/qrels'\n",
115-
" expected_md5: '79ed359fe0afa0f67eb39f468d162920'\n",
116-
" cache_path: 'qrels'\n",
100+
"```json\n",
101+
"\"dummy\": {\n",
102+
" \"docs\": {\n",
103+
" \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/docs.tsv\",\n",
104+
" \"expected_md5\": \"c7bb5a1a3a07d51de50e8414245c2be4\",\n",
105+
" \"cache_path\": \"docs.tsv\"\n",
106+
" },\n",
107+
" \"queries\": {\n",
108+
" \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/queries.tsv\",\n",
109+
" \"expected_md5\": \"08ba86d990cbe6890f727946346964db\",\n",
110+
" \"cache_path\": \"queries.tsv\"\n",
111+
" },\n",
112+
" \"qrels\": {\n",
113+
" \"url\": \"https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/qrels\",\n",
114+
" \"expected_md5\": \"79ed359fe0afa0f67eb39f468d162920\",\n",
115+
" \"cache_path\": \"qrels\"\n",
116+
" }\n",
117+
"}\n",
117118
"```"
118119
]
119120
},

ir_datasets/commands/build_download_cache.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import io
44
import os
55
import argparse
6-
import yaml
6+
import json
77
from contextlib import contextmanager
88
import ir_datasets
99

@@ -63,8 +63,8 @@ def main(args):
6363
parser.add_argument('--retries', default='10')
6464
args = parser.parse_args(args)
6565

66-
with open('ir_datasets/etc/downloads.yaml') as f:
67-
data = yaml.load(f, Loader=yaml.BaseLoader)
66+
with open('ir_datasets/etc/downloads.json') as f:
67+
data = json.load(f)
6868
with tmp_environ(IR_DATASETS_DL_TRIES=args.retries):
6969
_build_cache(data, args.dir)
7070

ir_datasets/datasets/dpr_w100.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from typing import NamedTuple, Tuple
2-
import ijson
32
import contextlib
43
import itertools
54
import ir_datasets
@@ -41,6 +40,7 @@ def __init__(self, dlc, base_path, passage_id_key='passage_id'):
4140
self._passage_id_key = passage_id_key
4241

4342
def build(self):
43+
ijson = ir_datasets.lazy_libs.ijson()
4444
if (self._base_path/'queries.tsv').exists():
4545
return # already built
4646

ir_datasets/datasets/highwire.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import codecs
22
from typing import NamedTuple, Tuple
33
from zipfile import ZipFile
4-
import lxml.html
54
import ir_datasets
65
from ir_datasets.util import DownloadConfig
76
from ir_datasets.formats import BaseDocs, BaseQueries, GenericQuery, BaseQrels
@@ -64,6 +63,7 @@ def docs_iter(self):
6463
return iter(self.docs_store())
6564

6665
def _docs_iter(self):
66+
lxml_html = ir_datasets.lazy_libs.lxml_html()
6767
def _legalspans_iter():
6868
with self._legalspans_dlc.stream() as f:
6969
prev_did, spans = None, None
@@ -86,11 +86,11 @@ def _legalspans_iter():
8686
assert legalspans_did == doc_id
8787
spans = tuple(HighwireSpan(s, l, doc_raw[s:s+l]) for s, l in legalspans)
8888
# the title should be in the first span inside a <h2> element
89-
title = lxml.html.document_fromstring(b'<OUTER>' + spans[0].text + b'</OUTER>')
89+
title = lxml_html.document_fromstring(b'<OUTER>' + spans[0].text + b'</OUTER>')
9090
title = title.xpath("//h2")
9191
title = title[0].text_content() if title else ''
9292
# keep just the text content within each spans
93-
spans = tuple(HighwireSpan(s, l, lxml.html.document_fromstring(b'<OUTER>' + t + b'</OUTER>').text_content()) for s, l, t in spans)
93+
spans = tuple(HighwireSpan(s, l, lxml_html.document_fromstring(b'<OUTER>' + t + b'</OUTER>').text_content()) for s, l, t in spans)
9494
yield HighwireDoc(doc_id, source, title, spans)
9595

9696
def docs_path(self):

ir_datasets/datasets/medline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import itertools
77
from typing import NamedTuple, Tuple
88
import tarfile
9-
import xml.etree.ElementTree as ET
109
import ir_datasets
1110
from ir_datasets.util import DownloadConfig, GzipExtract, ZipExtract
1211
from ir_datasets.formats import BaseDocs, BaseQueries, GenericQuery, TrecQrels, TrecXmlQueries
@@ -85,6 +84,7 @@ def __init__(self, name, dlcs):
8584

8685
@ir_datasets.util.use_docstore
8786
def docs_iter(self):
87+
ET = ir_datasets.lazy_libs.xml_etree()
8888
with ExitStack() as stack:
8989
if self._name == '2004':
9090
# The files for 2004 are a large XML file that's split internally.

ir_datasets/datasets/msmarco_passage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ def train_med():
217217
)
218218

219219
# DL-Hard
220-
dl_hard_qrels_migrator = Migrator(base_path/'trec-dl-hard'/'irds_version.txt', 'v2',
220+
dl_hard_qrels_migrator = Migrator(base_path/'trec-dl-hard'/'irds_version.txt', 'v3',
221221
affected_files=[base_path/'trec-dl-hard'/'qrels'],
222222
message='Updating trec-dl-hard qrels')
223223
hard_qids = Lazy(lambda: DL_HARD_QIDS)

ir_datasets/datasets/msmarco_qna.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import codecs
77
from typing import NamedTuple, Tuple
88
import re
9-
import ijson
109
import ir_datasets
1110
from ir_datasets.util import Cache, TarExtract, IterStream, GzipExtract, Lazy, DownloadConfig, Migrator
1211
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredScoredDocs, FilteredQrels, FilteredDocPairs, YamlDocumentation
@@ -96,6 +95,7 @@ def _internal_docs_store(self):
9695
return self._docs_store
9796

9897
def build(self):
98+
ijson = ir_datasets.lazy_libs.ijson()
9999
docs_store = self._internal_docs_store()
100100
if docs_store.built():
101101
return # already built

0 commit comments

Comments (0)