Commit 94f97be

aol-ia

1 parent 44bd6b2 commit 94f97be

8 files changed: +113 −112 lines changed

ir_datasets/__init__.py

Lines changed: 4 additions & 0 deletions

@@ -61,6 +61,10 @@ def docpairs_parent_id(dataset_id: str) -> str:
     return _parent_id(dataset_id, 'docpairs')


+def qlogs_parent_id(dataset_id: str) -> str:
+    return _parent_id(dataset_id, 'qlogs')
+
+
 def create_dataset(docs_tsv=None, queries_tsv=None, qrels_trec=None):
     LocalDownload = util.LocalDownload
     TsvDocs = formats.TsvDocs
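
Note: a minimal usage sketch for the new helper, mirroring docpairs_parent_id above. The subset ID and the resolved parent below are hypothetical; this assumes _parent_id walks up '/'-separated dataset IDs to the dataset that actually provides the qlogs.

import ir_datasets

# Hypothetical subset ID: qlogs_parent_id would resolve the ancestor
# dataset that provides the query log (assumed behaviour of _parent_id).
parent = ir_datasets.qlogs_parent_id('aol-ia/some-subset')
print(parent)  # expected: 'aol-ia'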

ir_datasets/commands/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@
 from . import build_clueweb_warc_indexes
 from . import build_download_cache
 from . import build_c4_checkpoints
-from . import aol_doc_downloader
 from . import clean

 COMMANDS = {

ir_datasets/datasets/aol_ia.py

Lines changed: 9 additions & 4 deletions

@@ -120,7 +120,13 @@ def _build_docs(self):
         if self._internal_docs_store().built():
             return
         if not (self._base_path/'downloaded_docs'/'_done').exists():
-            raise RuntimeError('Download docs using download.py in aolia-tools')
+            raise RuntimeError('''To use the documents of AOLIA, you will need to run the download script in https://github.com/terrierteam/aolia-tools. To run the script, use the following commands:
+
+git clone https://github.com/terrierteam/aolia-tools
+cd aolia-tools
+pip install -r requirements.txt
+python downloader.py
+''')
         LZ4FrameFile = ir_datasets.lazy_libs.lz4_frame().frame.LZ4FrameFile
         with _logger.pbar_raw(desc='', total=1525535) as pbar, self._internal_docs_store().lookup.transaction() as transaction:
             for file in sorted((self._base_path/'downloaded_docs').glob('*.jsonl.lz4')):

@@ -155,8 +161,8 @@ def build(self):
             for line in fin:
                 pbar.update()
                 cols = line.decode().rstrip('\n').split('\t')
-                if len(cols) == 3:
-                    user_id, query, query_time = cols
+                if tuple(cols[3:]) == ('', ''):
+                    user_id, query, query_time, _, _ = cols
                     rank, url = None, None
                 else:
                     user_id, query, query_time, rank, url = cols

@@ -166,7 +172,6 @@ def build(self):
                     f_queries.write(f'{query_id}\t{norm_query}\n')
                     encountered_qids.add(query_id)
                 log_items = []
-                # session_id = sessionizer.next_session_id(query_id, norm_query, user_id)
                 if url is not None:
                     doc_id = md5(url.encode()).hexdigest()[:DID_LEN]
                     f_qrels.write(f'{query_id}\t{user_id}\t{doc_id}\t1\n')
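
Note on the second hunk: the old check assumed non-click rows split into exactly three columns, while the new check treats a row as click-free when its rank and url columns are present but empty. A minimal sketch of the two row shapes this appears to distinguish; the values are illustrative (loosely based on the test fixtures below), and the clicked URL is invented:

# Non-click row: trailing rank/url columns exist but are empty,
# so split('\t') yields 5 columns rather than 3.
no_click = '142\trentdirect com\t2006-03-01 07:17:12\t\t'
clicked = '142\twestchester gov\t2006-03-20 03:55:57\t1\thttp://www.example.com'

cols = no_click.split('\t')
assert tuple(cols[3:]) == ('', '')
user_id, query, query_time, _, _ = cols

cols = clicked.split('\t')
user_id, query, query_time, rank, url = cols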

ir_datasets/datasets/tripclick.py

Lines changed: 0 additions & 102 deletions

@@ -92,25 +92,6 @@ def scoreddocs_cls(self):
         return self._scoreddocs[0].scoreddocs_cls()


-class LogItem(NamedTuple):
-    doc_id: str
-    clicked: bool
-
-
-class TripClickQlog(NamedTuple):
-    session_id: str
-    query_id: str
-    query: str
-    query_orig: str
-    time: datetime
-    items: Tuple[LogItem, ...]
-
-class TripClickPartialDoc(NamedTuple):
-    doc_id: str
-    title: str
-    url: str
-
-
 def ws_tok(s):
     s = re.sub('[^A-Za-z0-9 ]', ' ', s)
     left = 0

@@ -123,39 +104,6 @@ def ws_tok(s):
     yield s[left:len(s)]


-class TripClickQlogs:
-    def __init__(self, dlc):
-        self.dlc = dlc
-
-    def qlogs_iter(self):
-        for file in sorted(Path(self.dlc.path()).glob('logs/*.json')):
-            with file.open('rt') as fin:
-                for line in fin:
-                    record = json.loads(line)
-                    time = re.match(r'^/Date\(([0-9]+)\)/$', record['DateCreated']).group(1)
-                    query_norm = record['Keywords'].replace('AND', ' ').replace('OR', ' ').replace('title:', ' ')
-                    query_norm = ' '.join(ws_tok(query_norm))
-                    items = [LogItem(str(did), did == record['DocumentId']) for did in record['Documents']]
-                    if record['DocumentId'] and not any(i.clicked for i in items):
-                        items += [LogItem(str(record['DocumentId']), True)]
-                    yield TripClickQlog(
-                        record['SessionId'],
-                        hashlib.md5(query_norm.encode()).hexdigest()[:Q_HASH_LEN],
-                        query_norm,
-                        record['Keywords'],
-                        datetime.fromtimestamp(int(time)/1000),
-                        tuple(items)
-                    )
-
-    def qlogs_handler(self):
-        return self
-
-    def qlogs_cls(self):
-        return TripClickQlog
-
-    def qlogs_count(self):
-        return 5_317_350
-

 class DocPairGenerator:
     def __init__(self, docpair_dlc, collection, queries, cache_path):

@@ -202,49 +150,6 @@ def stream(self):
         with open(self.path(), 'rb') as f:
             yield f

-# The allarticles.txt file (tsv) has a couple of problems, stemming from the fact that titles
-# can include \t and \n characters. This class corrects these problems. It also removed the
-# first (header) line and the final line ("(5196956 rows affected)"), and corrects a few strange
-# things with some URLs.
-class FixAllarticles:
-    def __init__(self, streamer):
-        self._streamer = streamer
-
-    def stream(self):
-        return io.BufferedReader(IterStream(iter(self)), buffer_size=io.DEFAULT_BUFFER_SIZE)
-
-    def __iter__(self):
-        with self._streamer.stream() as stream, \
-             _logger.pbar_raw(desc='fixing allarticles.txt', unit='B', unit_scale=True) as pbar:
-            # NOTE: codecs.getreader is subtly broken here; it sometimes splits lines between special characters (and it's unclear why)
-            next(stream) # remove header
-            did, title, url = None, [], None
-            for line in stream:
-                pbar.update(len(line))
-                line = line.decode().strip()
-                if line == '' or line == '(5196956 rows affected)':
-                    continue
-                cols = line.split('\t')
-                if did is None:
-                    did = cols[0]
-                    assert did.isnumeric(), line
-                    cols = cols[1:]
-                    if did in ('9283014', '11088688', '11114797'): # a few special cases where the URL is actually missing. If we don't fix this here, we'll end up messing up subsequent records
-                        yield ('\t'.join([did, ' '.join(cols), '']) + '\n').encode()
-                        did, title, url = None, [], None
-                if len(cols) > 0:
-                    if cols[-1].startswith('_http'): # some URLs have this strange prefix, remove
-                        cols[-1] = cols[-1][1:]
-                    if cols[-1].startswith('ttp://'):
-                        cols[-1] = 'h' + cols[-1]
-                    if cols[-1].startswith('http') or cols[-1].startswith('file:///C:'):
-                        title += cols[:-1]
-                        url = cols[-1]
-                        yield ('\t'.join([did, ' '.join(title).strip(), url]) + '\n').encode()
-                        did, title, url = None, [], None
-                    else:
-                        title += cols
-

 def _init():
     subsets = {}

@@ -256,18 +161,11 @@ def _init():
     topics_and_qrels = TarExtractAll(dlc['benchmark'], base_path/"topics_and_qrels", path_globs=['**/topics.*.txt', '**/qrels.*.txt'])
     val_runs = TarExtractAll(dlc['dlfiles'], base_path/"val_runs", path_globs=['**/run.trip.BM25.*.val.txt'])
     test_runs = TarExtractAll(dlc['dlfiles_runs_test'], base_path/"test_runs", path_globs=['**/run.trip.BM25.*.test.txt'])
-    qlogs = TripClickQlogs(TarExtractAll(dlc['logs'], base_path/'logs', path_globs=['**/*.json']))

     base = Dataset(
         collection,
-        qlogs,
         documentation('_'))

-    subsets['log'] = Dataset(
-        TsvDocs(Cache(FixAllarticles(TarExtract(dlc['logs'], 'logs/allarticles.txt')), base_path/'logs'/'allarticles-fixed.tsv'), doc_cls=TripClickPartialDoc, lang='en'),
-        qlogs,
-        documentation('logs'))
-
     ### Train

     subsets['train/head'] = Dataset(

ir_datasets/docs/aol-ia.yaml

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+_:
+  pretty_name: 'AOL-IA (Internet Archive)'
+  desc: '
+<p>
+This is a version of the AOL Query Log. Documents use versions that appeared around the time
+of the query log (early 2006) via the Internet Archive.
+</p>
+<p>
+The query log does not include document or query IDs. These are instead created by ir_datasets.
+Document IDs are assigned using a hash of the URL that appears in the query log. Query IDs are
+assigned using a hash of the normalized query. All unique normalized queries are available
+from <kbd>queries</kbd>, and all clicked documents are available from <kbd>qrels</kbd> (iteration
+value set to the user ID). Full information (including the original query) is available from
+<kbd>qlogs</kbd>.
+</p>
+'
+  bibtex_ids: ['Pass2006Picture']
+  docs_instructions: &inst "docs available using aolia-tools package"
+  data_access: '
+<p>
+To use the documents of this dataset, you will need to run the download
+script in <a href="https://github.com/terrierteam/aolia-tools">aolia-tools</a>.
+To run the script, use the following commands:
+</p>
+<code>
+git clone https://github.com/terrierteam/aolia-tools<br/>
+cd aolia-tools<br/>
+pip install -r requirements.txt<br/>
+python downloader.py<br/>
+</code>
+<p>
+It takes around 2 days to download all documents.
+</p>
+'
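
Note: once the aolia-tools download completes (and the _done marker exists), documents load as usual. A minimal sketch, assuming the standard ir_datasets docs API and the AolIaDoc field layout shown in the tests below; the fixture doc IDs (12 hex chars) are consistent with the md5(url)[:DID_LEN] logic in aol_ia.py above.

import ir_datasets

dataset = ir_datasets.load('aol-ia')
# Raises the RuntimeError added above until the download has finished.
for doc in dataset.docs_iter():
    print(doc.doc_id, doc.title)  # field names assumed from the AolIaDoc fixtures
    break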

ir_datasets/docs/bibliography.bib

Lines changed: 7 additions & 0 deletions

@@ -684,3 +684,10 @@ @article{Bonifacio2021MMarco
   year={2021},
   journal={arXiv:2108.13897}
 }
+
+@inproceedings{Pass2006Picture,
+  title={A picture of search},
+  author={Pass, Greg and Chowdhury, Abdur and Torgeson, Cayley},
+  booktitle={InfoScale},
+  year={2006}
+}

test/integration/aol_ia.py

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+import re
+import unittest
+import datetime
+import ir_datasets
+from ir_datasets.formats import GenericQuery, TrecQrel
+from ir_datasets.datasets.aol_ia import AolIaDoc, AolQlog, LogItem
+from .base import DatasetIntegrationTest
+
+
+class TestAolIa(DatasetIntegrationTest):
+    def test_docs(self):
+        self._test_docs('aol-ia', count=1525586, items={
+            0: AolIaDoc('00002a94464d', 'Alchimie Forever skin care for men and women', re.compile("^require \\( `` include/config\\.php '' \\) ; if \\( \\$ PHPSESSID \\) session_start \\( \\$ PHPSESSID \\) ; else sessi.{328} Terms and conditions © 2005 Alchimie Forever Sàrl \\. All rights reserved \\. Design : Agence Virtuelle$", flags=48), 'http://www.alchimie-forever.com', 'https://web.archive.org/web/20060218092031/http://www.alchimie-forever.com:80/'),
+            9: AolIaDoc('00007d6c3dd3', 'Pinehurst Tea Room & Caterering', re.compile('^We have had visitors \\. Welcome to Pinehurst Tea Room \\. This beautifully restored Victorian house is .{456}n please contact Lynda Dubbs at 770\\-474\\-7997 or feel free to email her at pinehursttearoom @ aol\\.com$', flags=48), 'http://www.pinehursttearoom.com', 'https://web.archive.org/web/20060209164740/http://www.pinehursttearoom.com:80/'),
+            1525585: AolIaDoc('fffff6b18440', 'Golf School - Arizona Golf School , Florida Golf School , Calfornia Golf School', '', 'http://lvgolfschools.com', 'https://web.archive.org/web/20060211025934/http://www.lvgolfschools.com:80/'),
+        })
+
+    def test_queries(self):
+        self._test_queries('aol-ia', count=9966939, items={
+            0: GenericQuery('8c418e7c9e5993', 'rentdirect com'),
+            9: GenericQuery('c8476c36af8761', 'www elaorg'),
+            9966938: GenericQuery('bba88dc56436eb', 'c21curabba'),
+        })
+
+    def test_qrels(self):
+        self._test_qrels('aol-ia', count=19442629, items={
+            0: TrecQrel('50aa67fe786ca7', '430d8aa747a3', 1, '142'),
+            9: TrecQrel('f6eff9e0848e2d', 'ecd6d884243b', 1, '217'),
+            19442628: TrecQrel('14c1b5b54212ad', 'a114f6d94af0', 1, '24967361'),
+        })
+
+    def test_qlog(self):
+        self._test_qlogs('aol-ia', count=36389567, items={
+            0: AolQlog('142', '8c418e7c9e5993', 'rentdirect com', 'rentdirect.com', datetime.datetime(2006, 3, 1, 7, 17, 12), ()),
+            6: AolQlog('142', '50aa67fe786ca7', 'westchester gov', 'westchester.gov', datetime.datetime(2006, 3, 20, 3, 55, 57), (LogItem('430d8aa747a3', '1', True),)),
+            9: AolQlog('142', 'b52c96bea30646', 'dfdf', 'dfdf', datetime.datetime(2006, 3, 24, 22, 23, 14), ()),
+            36389566: AolQlog('24969339', 'a03587795a216c', 'free credit report', 'free credit report', datetime.datetime(2006, 5, 31, 0, 42, 17), ()),
+        })
+
+
+if __name__ == '__main__':
+    unittest.main()
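
Note: a hedged sketch of consuming the new qlogs entity. Field names are assumed from the AolQlog fixtures above (user ID, query ID, normalized query, original query, time, clicked items) and from the TripClickQlog analogue removed in this commit; treat them as illustrative. Since the module ends in a unittest.main() guard, the test itself can presumably be run with `python -m unittest test.integration.aol_ia`.

import ir_datasets

dataset = ir_datasets.load('aol-ia')
for qlog in dataset.qlogs_iter():
    # Assumed field names; illustrative only.
    print(qlog.query_id, qlog.query, qlog.time, len(qlog.items))
    break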

test/integration/base.py

Lines changed: 17 additions & 5 deletions

@@ -90,19 +90,19 @@ def _test_qrels(self, dataset_name, count=None, items=None):

         self.assertEqual(0, len(items))

-    def _test_scoreddocs(self, dataset_name, count=None, items=None):
-        with self.subTest('scoreddocs', dataset=dataset_name):
+    def _test_qlogs(self, dataset_name, count=None, items=None):
+        with self.subTest('qlogs', dataset=dataset_name):
             if isinstance(dataset_name, str):
                 dataset = ir_datasets.load(dataset_name)
             else:
                 dataset = dataset_name
             expected_count = count
             items = items or {}
             count = 0
-            for i, scoreddoc in enumerate(_logger.pbar(dataset.scoreddocs_iter(), f'{dataset_name} scoreddocs', unit='scoreddoc')):
+            for i, qlogs in enumerate(_logger.pbar(dataset.qlogs_iter(), f'{dataset_name} qlogs', unit='qlog')):
                 count += 1
                 if i in items:
-                    self._assert_namedtuple(scoreddoc, items[i])
+                    self._assert_namedtuple(qlogs, items[i])
                     del items[i]
                 if expected_count is None and len(items) == 0:
                     break # no point in going further

@@ -211,7 +211,19 @@ def _build_test_docpairs(self, dataset_name):
             items[i] = docpair
         items[count-1] = docpair
         _logger.info(f'''
-self._test_docpairs(i{repr(dataset_name)}, count={count}, items={self._repr_namedtuples(items)})
+self._test_docpairs({repr(dataset_name)}, count={count}, items={self._repr_namedtuples(items)})
+''')
+
+    def _build_test_qlogs(self, dataset_name):
+        items = {}
+        count = 0
+        for i, qlog in enumerate(_logger.pbar(ir_datasets.load(dataset_name).qlogs_iter(), f'{dataset_name} qlogs', unit='qlogs')):
+            count += 1
+            if i in (0, 9):
+                items[i] = qlog
+            items[count-1] = qlog
+        _logger.info(f'''
+self._test_qlogs({repr(dataset_name)}, count={count}, items={self._repr_namedtuples(items)})
 ''')

     def _assert_namedtuple(self, a, b):
