
Commit fe53276

metadata (allenai#132)
* wip, computing metadata for datasets (so far, hash and count); includes a command line utility to generate the metadata allenai#66
* some metadata
* metadata
* much faster gov and gov2 processing
* base qlogs class
* metadata refactoring
* better metadata file formatting
* clueweb doc metadata
* providing *_count via metadata if not yet available; also making more *_count functions lazy
* metadata updates
* removed a global numpy import to improve package load time
* finished clirmatrix metadata
* defer to metadata to provide docstore count hints
* beta python api support
* minor metadata file formatting
* fixed integration bugs for c4 and tweets2013-ia
* added args.me metadata
1 parent 5012ca4 commit fe53276

61 files changed: +1249, -441 lines (large commit; some files are hidden by default and not shown below)

.github/workflows/push.yml

Lines changed: 2 additions & 2 deletions
@@ -45,12 +45,12 @@ jobs:
       if: matrix.os == 'ubuntu-latest' || matrix.os == 'macOs-latest'
       run: |
         pip install pytest
-        pytest test/util.py test/integration/dummy.py test/integration/vaswani.py test/formats/
+        pytest test/util.py test/metadata.py test/integration/dummy.py test/integration/vaswani.py test/formats/
     - name: Test-windows with pytest
       if: matrix.os == 'windows-latest'
       shell: cmd
       run: |
         pip install pytest
-        pytest test\util.py test\integration\dummy.py test\integration\vaswani.py test\formats\
+        pytest test\util.py test\metadata.py test\integration\dummy.py test\integration\vaswani.py test\formats\
       env:
         PATH: 'C:/Program Files/zlib/bin/'

ir_datasets/__init__.py

Lines changed: 18 additions & 10 deletions
@@ -1,3 +1,11 @@
+from enum import Enum
+class EntityType(Enum):
+    docs = "docs"
+    queries = "queries"
+    qrels = "qrels"
+    scoreddocs = "scoreddocs"
+    docpairs = "docpairs"
+    qlogs = "qlogs"
 from . import lazy_libs
 from . import log
 from . import util
@@ -10,12 +18,11 @@

 Dataset = datasets.base.Dataset

-
 def load(name):
     return registry[name]


-def _parent_id(dataset_id: str, entity_type: str) -> str:
+def parent_id(dataset_id: str, entity_type: EntityType) -> str:
     """
     Maps a dataset_id to a more general ID that shares the same entity handler (e.g., docs_handler). For example,
     for docs, "msmarco-document/trec-dl-2019/judged" -> "msmarco-document" or "wikir/en1k/test" -> "wikir/en1k".
@@ -25,44 +32,45 @@ def _parent_id(dataset_id: str, entity_type: str) -> str:
     hierarchy that has the same docs_handler instance. This function may be updated in the future to
     also use explicit links added when datasets are registered.
     """
+    entity_type = EntityType(entity_type) # validate & allow strings
     ds = load(dataset_id)
     segments = dataset_id.split("/")
-    handler = getattr(ds, f'{entity_type}_handler')()
+    handler = getattr(ds, f'{entity_type.value}_handler')()
     parent_ds_id = dataset_id
     while len(segments) > 1:
         segments.pop()
         try:
             parent_ds = load("/".join(segments))
-            if getattr(parent_ds, f'has_{entity_type}')() and getattr(parent_ds, f'{entity_type}_handler')() == handler:
+            if parent_ds.has(entity_type.value) and getattr(parent_ds, f'{entity_type.value}_handler')() == handler:
                 parent_ds_id = "/".join(segments)
         except KeyError:
             pass # this dataset doesn't exist
     return parent_ds_id


 def docs_parent_id(dataset_id: str) -> str:
-    return _parent_id(dataset_id, 'docs')
+    return parent_id(dataset_id, EntityType.docs)
 corpus_id = docs_parent_id # legacy


 def queries_parent_id(dataset_id: str) -> str:
-    return _parent_id(dataset_id, 'queries')
+    return parent_id(dataset_id, EntityType.queries)


 def qrels_parent_id(dataset_id: str) -> str:
-    return _parent_id(dataset_id, 'qrels')
+    return parent_id(dataset_id, EntityType.qrels)


 def scoreddocs_parent_id(dataset_id: str) -> str:
-    return _parent_id(dataset_id, 'scoreddocs')
+    return parent_id(dataset_id, EntityType.scoreddocs)


 def docpairs_parent_id(dataset_id: str) -> str:
-    return _parent_id(dataset_id, 'docpairs')
+    return parent_id(dataset_id, EntityType.docpairs)


 def qlogs_parent_id(dataset_id: str) -> str:
-    return _parent_id(dataset_id, 'qlogs')
+    return parent_id(dataset_id, EntityType.qlogs)


 def create_dataset(docs_tsv=None, queries_tsv=None, qrels_trec=None):
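
The renamed parent_id function is now public and accepts either an EntityType member or a plain string (it coerces strings via EntityType(entity_type), as shown in the hunk above). A minimal usage sketch, reusing the dataset IDs and expected results from the function's docstring:

    import ir_datasets
    from ir_datasets import EntityType

    # equivalent calls; strings are validated and coerced inside parent_id
    ir_datasets.parent_id("msmarco-document/trec-dl-2019/judged", EntityType.docs)  # -> "msmarco-document"
    ir_datasets.parent_id("wikir/en1k/test", "docs")                                # -> "wikir/en1k"

    # the per-entity wrappers are unchanged from a caller's perspective
    ir_datasets.docs_parent_id("msmarco-document/trec-dl-2019/judged")              # -> "msmarco-document"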

ir_datasets/commands/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@
 from . import build_download_cache
 from . import build_c4_checkpoints
 from . import clean
+from . import generate_metadata

 COMMANDS = {
     'doc_fifos': doc_fifos.main,
@@ -16,4 +17,5 @@
     'build_c4_checkpoints': build_c4_checkpoints.main,
     'build_download_cache': build_download_cache.main,
     'clean': clean.main,
+    'generate_metadata': generate_metadata.main,
 }

ir_datasets/commands/generate_metadata.py

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+import time
+import sys
+import os
+import json
+import argparse
+from pathlib import Path
+from fnmatch import fnmatch
+import ir_datasets
+from ir_datasets.util import DownloadConfig
+
+
+_logger = ir_datasets.log.easy()
+
+
+def dataset2metadata(args):
+    dsid, data = args
+    try:
+        dataset = ir_datasets.load(dsid)
+    except KeyError:
+        return dsid, None
+    try:
+        for e in ir_datasets.EntityType:
+            if dataset.has(e):
+                if e.value not in data:
+                    parent_id = getattr(ir_datasets, f'{e.value}_parent_id')(dsid)
+                    if parent_id != dsid:
+                        data[e.value] = {'_ref': parent_id}
+                    else:
+                        with _logger.duration(f'{dsid} {e.value}'):
+                            data[e.value] = getattr(dataset, f'{e.value}_calc_metadata')()
+                        _logger.info(f'{dsid} {e.value}: {data[e.value]}')
+    except Exception as ex:
+        _logger.info(f'{dsid} {e.value} [error]: {ex}')
+        return dsid, None
+    return dsid, data
+
+
+def write_metadata_file(data, file):
+    with file.open('wt') as f:
+        # partially-formatted data; one dataset per line
+        f.write('{\n')
+        for i, key in enumerate(sorted(data.keys())):
+            if i != 0:
+                f.write(',\n')
+            f.write(f'  "{key}": {json.dumps(data[key])}')
+        f.write('\n}\n')
+
+
+def main(args):
+    parser = argparse.ArgumentParser(prog='ir_datasets generate_metadata', description='Generates metadata for the specified datasets')
+    parser.add_argument('--file', help='output file', type=Path, default=Path('ir_datasets/etc/metadata.json'))
+    parser.add_argument('--datasets', nargs='+', help='dataset IDs for which to compute metadata. If omitted, generates for all datasets present in the registry (skipping patterns)')
+
+    args = parser.parse_args(args)
+    if args.file.is_file():
+        with args.file.open('rb') as f:
+            data = json.load(f)
+    else:
+        data = {}
+
+    if args.datasets:
+        def _ds_iter():
+            for dsid in args.datasets:
+                yield dsid, data.get(dsid, {})
+        import multiprocessing
+        with multiprocessing.Pool(10) as pool:
+            for dsid, dataset_metadata in _logger.pbar(pool.imap_unordered(dataset2metadata, _ds_iter()), desc='datasets', total=len(args.datasets)):
+                if dataset_metadata is not None:
+                    data[dsid] = dataset_metadata
+        write_metadata_file(data, args.file)
+    else:
+        for dsid in ir_datasets.registry._registered:
+            dataset = ir_datasets.load(dsid)
+            brk = False
+            try:
+                _, dataset_metadata = dataset2metadata((dsid, data.get(dsid, {})))
+                if dataset_metadata is not None:
+                    data[dsid] = dataset_metadata
+            except KeyboardInterrupt:
+                _logger.info(f'KeyboardInterrupt; skipping. ctrl+c within 0.5sec to stop compute_metadata.')
+                try:
+                    time.sleep(0.5)
+                except KeyboardInterrupt:
+                    brk = True
+                    break
+            write_metadata_file(data, args.file)
+            if brk:
+                break
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
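
Since the command is registered in COMMANDS above, it can be run through the ir_datasets CLI or driven directly from Python. A small sketch (the two dataset IDs are just ones that appear elsewhere in this commit; --file defaults to ir_datasets/etc/metadata.json as defined above):

    from ir_datasets.commands import generate_metadata

    # compute/refresh metadata for two specific datasets (processed in a
    # multiprocessing pool of 10 workers, per main() above)
    generate_metadata.main(['--datasets', 'antique', 'aquaint'])

    # with no --datasets, every registered dataset is processed sequentially
    # and the metadata file is rewritten after each one
    generate_metadata.main([])

write_metadata_file keeps each dataset's entry on its own line, so diffs of the checked-in metadata.json stay reviewable.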

ir_datasets/datasets/antique.py

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ def _init():
     documentation = YamlDocumentation('docs/antique.yaml')
     base_path = ir_datasets.util.home_path() / NAME
     dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
-    collection = TsvDocs(dlc['docs'], namespace=NAME, lang='en', count_hint=403_666)
+    collection = TsvDocs(dlc['docs'], namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))

     subsets = {}
     for subset in ('train', 'test'):
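
The hard-coded count_hint values here and in the other dataset modules are replaced with lookups through ir_datasets.util.count_hint(NAME), which presumably resolves the hint from the generated metadata file (its implementation is not part of this excerpt). A hedged sketch of the call as used above:

    import ir_datasets

    # NAME is 'antique' in this module; the result is passed straight to
    # TsvDocs(count_hint=...). Whether it is an int or a lazy provider is
    # not shown in this diff.
    hint = ir_datasets.util.count_hint('antique')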

ir_datasets/datasets/aol_ia.py

Lines changed: 6 additions & 8 deletions
@@ -8,7 +8,7 @@
 import ir_datasets
 from typing import NamedTuple, Tuple
 from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract, finialized_file
-from ir_datasets.formats import TrecQrels, TsvQueries, DocstoreBackedDocs
+from ir_datasets.formats import TrecQrels, TsvQueries, DocstoreBackedDocs, BaseQlogs
 from ir_datasets.datasets.base import Dataset, YamlDocumentation

 _logger = ir_datasets.log.easy()
@@ -45,7 +45,7 @@ class AolIaDoc(NamedTuple):
     ia_url: str


-class AolQlogs:
+class AolQlogs(BaseQlogs):
     def __init__(self, dlc):
         self.dlc = dlc

@@ -59,9 +59,6 @@ def qlogs_iter(self):
         except EOFError:
             pass

-    def qlogs_handler(self):
-        return self
-
     def qlogs_cls(self):
         return AolQlog

@@ -80,8 +77,9 @@ def stream(self):
         with open(self._path, 'rb') as f:
             yield f

-    def path(self):
-        self._manager.build()
+    def path(self, force=True):
+        if force:
+            self._manager.build()
         return self._path


@@ -101,7 +99,7 @@ def docs_store(self):

     def _internal_docs_store(self):
         if self._docs_store is None:
-            self._docs_store = ir_datasets.indices.PickleLz4FullStore(self._base_path/'docs.pklz4', None, AolIaDoc, 'doc_id', ['doc_id'], count_hint=1525535)
+            self._docs_store = ir_datasets.indices.PickleLz4FullStore(self._base_path/'docs.pklz4', None, AolIaDoc, 'doc_id', ['doc_id'], count_hint=ir_datasets.util.count_hint(NAME))
         return self._docs_store

     def _build_docs(self):

ir_datasets/datasets/aquaint.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ def _init():
     dlc = DownloadConfig.context(NAME, base_path)
     documentation = YamlDocumentation(f'docs/{NAME}.yaml')

-    collection = TrecDocs(dlc['docs'], encoding='utf8', path_globs=['aquaint_comp/apw/*/*.gz', 'aquaint_comp/nyt/*/*.gz', 'aquaint_comp/xie/*/*.gz'], namespace=NAME, lang='en', count_hint=1033461)
+    collection = TrecDocs(dlc['docs'], encoding='utf8', path_globs=['aquaint_comp/apw/*/*.gz', 'aquaint_comp/nyt/*/*.gz', 'aquaint_comp/xie/*/*.gz'], namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))

     base = Dataset(collection, documentation('_'))

