Commit 94f97be

aol-ia

1 parent 44bd6b2 commit 94f97be

8 files changed: +113 −112 lines changed

ir_datasets/__init__.py

Lines changed: 4 additions & 0 deletions

@@ -61,6 +61,10 @@ def docpairs_parent_id(dataset_id: str) -> str:
     return _parent_id(dataset_id, 'docpairs')


+def qlogs_parent_id(dataset_id: str) -> str:
+    return _parent_id(dataset_id, 'qlogs')
+
+
 def create_dataset(docs_tsv=None, queries_tsv=None, qrels_trec=None):
     LocalDownload = util.LocalDownload
     TsvDocs = formats.TsvDocs
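
Note: a minimal usage sketch for the new helper, mirroring docpairs_parent_id above. The subset ID and the resolved parent below are hypothetical; this assumes _parent_id walks up '/'-separated dataset IDs to the dataset that actually provides the qlogs.

import ir_datasets

# Hypothetical subset ID: qlogs_parent_id would resolve the ancestor
# dataset that provides the query log (assumed behaviour of _parent_id).
parent = ir_datasets.qlogs_parent_id('aol-ia/some-subset')
print(parent)  # expected: 'aol-ia'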

ir_datasets/commands/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@
 from . import build_clueweb_warc_indexes
 from . import build_download_cache
 from . import build_c4_checkpoints
-from . import aol_doc_downloader
 from . import clean

 COMMANDS = {

ir_datasets/datasets/aol_ia.py

Lines changed: 9 additions & 4 deletions

@@ -120,7 +120,13 @@ def _build_docs(self):
         if self._internal_docs_store().built():
             return
         if not (self._base_path/'downloaded_docs'/'_done').exists():
-            raise RuntimeError('Download docs using download.py in aolia-tools')
+            raise RuntimeError('''To use the documents of AOLIA, you will need to run the download script in https://github.com/terrierteam/aolia-tools. To run the script, use the following commands:
+
+git clone https://github.com/terrierteam/aolia-tools
+cd aolia-tools
+pip install -r requirements.txt
+python downloader.py
+''')
         LZ4FrameFile = ir_datasets.lazy_libs.lz4_frame().frame.LZ4FrameFile
         with _logger.pbar_raw(desc='', total=1525535) as pbar, self._internal_docs_store().lookup.transaction() as transaction:
             for file in sorted((self._base_path/'downloaded_docs').glob('*.jsonl.lz4')):

@@ -155,8 +161,8 @@ def build(self):
             for line in fin:
                 pbar.update()
                 cols = line.decode().rstrip('\n').split('\t')
-                if len(cols) == 3:
-                    user_id, query, query_time = cols
+                if tuple(cols[3:]) == ('', ''):
+                    user_id, query, query_time, _, _ = cols
                     rank, url = None, None
                 else:
                     user_id, query, query_time, rank, url = cols

@@ -166,7 +172,6 @@ def build(self):
                     f_queries.write(f'{query_id}\t{norm_query}\n')
                     encountered_qids.add(query_id)
                 log_items = []
-                # session_id = sessionizer.next_session_id(query_id, norm_query, user_id)
                 if url is not None:
                     doc_id = md5(url.encode()).hexdigest()[:DID_LEN]
                     f_qrels.write(f'{query_id}\t{user_id}\t{doc_id}\t1\n')
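
Note on the second hunk: the old check assumed non-click rows split into exactly three columns, while the new check treats a row as click-free when its rank and url columns are present but empty. A minimal sketch of the two row shapes this appears to distinguish; the values are illustrative (loosely based on the test fixtures below), and the clicked URL is invented:

# Non-click row: trailing rank/url columns exist but are empty,
# so split('\t') yields 5 columns rather than 3.
no_click = '142\trentdirect com\t2006-03-01 07:17:12\t\t'
clicked = '142\twestchester gov\t2006-03-20 03:55:57\t1\thttp://www.example.com'

cols = no_click.split('\t')
assert tuple(cols[3:]) == ('', '')
user_id, query, query_time, _, _ = cols

cols = clicked.split('\t')
user_id, query, query_time, rank, url = cols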

ir_datasets/datasets/tripclick.py

Lines changed: 0 additions & 102 deletions

@@ -92,25 +92,6 @@ def scoreddocs_cls(self):
         return self._scoreddocs[0].scoreddocs_cls()


-class LogItem(NamedTuple):
-    doc_id: str
-    clicked: bool
-
-
-class TripClickQlog(NamedTuple):
-    session_id: str
-    query_id: str
-    query: str
-    query_orig: str
-    time: datetime
-    items: Tuple[LogItem, ...]
-
-class TripClickPartialDoc(NamedTuple):
-    doc_id: str
-    title: str
-    url: str
-
-
 def ws_tok(s):
     s = re.sub('[^A-Za-z0-9 ]', ' ', s)
     left = 0

@@ -123,39 +104,6 @@ def ws_tok(s):
     yield s[left:len(s)]


-class TripClickQlogs:
-    def __init__(self, dlc):
-        self.dlc = dlc
-
-    def qlogs_iter(self):
-        for file in sorted(Path(self.dlc.path()).glob('logs/*.json')):
-            with file.open('rt') as fin:
-                for line in fin:
-                    record = json.loads(line)
-                    time = re.match(r'^/Date\(([0-9]+)\)/$', record['DateCreated']).group(1)
-                    query_norm = record['Keywords'].replace('AND', ' ').replace('OR', ' ').replace('title:', ' ')
-                    query_norm = ' '.join(ws_tok(query_norm))
-                    items = [LogItem(str(did), did == record['DocumentId']) for did in record['Documents']]
-                    if record['DocumentId'] and not any(i.clicked for i in items):
-                        items += [LogItem(str(record['DocumentId']), True)]
-                    yield TripClickQlog(
-                        record['SessionId'],
-                        hashlib.md5(query_norm.encode()).hexdigest()[:Q_HASH_LEN],
-                        query_norm,
-                        record['Keywords'],
-                        datetime.fromtimestamp(int(time)/1000),
-                        tuple(items)
-                    )
-
-    def qlogs_handler(self):
-        return self
-
-    def qlogs_cls(self):
-        return TripClickQlog
-
-    def qlogs_count(self):
-        return 5_317_350
-

 class DocPairGenerator:
     def __init__(self, docpair_dlc, collection, queries, cache_path):

@@ -202,49 +150,6 @@ def stream(self):
         with open(self.path(), 'rb') as f:
             yield f

-# The allarticles.txt file (tsv) has a couple of problems, stemming from the fact that titles
-# can include \t and \n characters. This class corrects these problems. It also removed the
-# first (header) line and the final line ("(5196956 rows affected)"), and corrects a few strange
-# things with some URLs.
-class FixAllarticles:
-    def __init__(self, streamer):
-        self._streamer = streamer
-
-    def stream(self):
-        return io.BufferedReader(IterStream(iter(self)), buffer_size=io.DEFAULT_BUFFER_SIZE)
-
-    def __iter__(self):
-        with self._streamer.stream() as stream, \
-             _logger.pbar_raw(desc='fixing allarticles.txt', unit='B', unit_scale=True) as pbar:
-            # NOTE: codecs.getreader is subtly broken here; it sometimes splits lines between special characters (and it's unclear why)
-            next(stream) # remove header
-            did, title, url = None, [], None
-            for line in stream:
-                pbar.update(len(line))
-                line = line.decode().strip()
-                if line == '' or line == '(5196956 rows affected)':
-                    continue
-                cols = line.split('\t')
-                if did is None:
-                    did = cols[0]
-                    assert did.isnumeric(), line
-                    cols = cols[1:]
-                    if did in ('9283014', '11088688', '11114797'): # a few special cases where the URL is actually missing. If we don't fix this here, we'll end up messing up subsequent records
-                        yield ('\t'.join([did, ' '.join(cols), '']) + '\n').encode()
-                        did, title, url = None, [], None
-                if len(cols) > 0:
-                    if cols[-1].startswith('_http'): # some URLs have this strange prefix, remove
-                        cols[-1] = cols[-1][1:]
-                    if cols[-1].startswith('ttp://'):
-                        cols[-1] = 'h' + cols[-1]
-                    if cols[-1].startswith('http') or cols[-1].startswith('file:///C:'):
-                        title += cols[:-1]
-                        url = cols[-1]
-                        yield ('\t'.join([did, ' '.join(title).strip(), url]) + '\n').encode()
-                        did, title, url = None, [], None
-                    else:
-                        title += cols
-

 def _init():
     subsets = {}

@@ -256,18 +161,11 @@ def _init():
     topics_and_qrels = TarExtractAll(dlc['benchmark'], base_path/"topics_and_qrels", path_globs=['**/topics.*.txt', '**/qrels.*.txt'])
     val_runs = TarExtractAll(dlc['dlfiles'], base_path/"val_runs", path_globs=['**/run.trip.BM25.*.val.txt'])
     test_runs = TarExtractAll(dlc['dlfiles_runs_test'], base_path/"test_runs", path_globs=['**/run.trip.BM25.*.test.txt'])
-    qlogs = TripClickQlogs(TarExtractAll(dlc['logs'], base_path/'logs', path_globs=['**/*.json']))

     base = Dataset(
         collection,
-        qlogs,
         documentation('_'))

-    subsets['log'] = Dataset(
-        TsvDocs(Cache(FixAllarticles(TarExtract(dlc['logs'], 'logs/allarticles.txt')), base_path/'logs'/'allarticles-fixed.tsv'), doc_cls=TripClickPartialDoc, lang='en'),
-        qlogs,
-        documentation('logs'))
-
     ### Train

     subsets['train/head'] = Dataset(

ir_datasets/docs/aol-ia.yaml

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+_:
+  pretty_name: 'AOL-IA (Internet Archive)'
+  desc: '
+<p>
+This is a version of the AOL Query Log. Documents use versions that appeared around the time
+of the query log (early 2006) via the Internet Archive.
+</p>
+<p>
+The query log does not include document or query IDs. These are instead created by ir_datasets.
+Document IDs are assigned using a hash of the URL that appears in the query log. Query IDs are
+assigned using a hash of the normalized query. All unique normalized queries are available
+from <kbd>queries</kbd>, and all clicked documents are available from <kbd>qrels</kbd> (iteration
+value set to the user ID). Full information (including the original query) is available from
+<kbd>qlogs</kbd>.
+</p>
+'
+  bibtex_ids: ['Pass2006Picture']
+  docs_instructions: &inst "docs available using aolia-tools package"
+  data_access: '
+<p>
+To use the documents of this dataset, you will need to run the download
+script in <a href="https://github.com/terrierteam/aolia-tools">aolia-tools</a>.
+To run the script, use the following commands:
+</p>
+<code>
+git clone https://github.com/terrierteam/aolia-tools<br/>
+cd aolia-tools<br/>
+pip install -r requirements.txt<br/>
+python downloader.py<br/>
+</code>
+<p>
+It takes around 2 days to download all documents.
+</p>
+'
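
Note: once the aolia-tools download completes (and the _done marker exists), documents load as usual. A minimal sketch, assuming the standard ir_datasets docs API and the AolIaDoc field layout shown in the tests below; the fixture doc IDs (12 hex chars) are consistent with the md5(url)[:DID_LEN] logic in aol_ia.py above.

import ir_datasets

dataset = ir_datasets.load('aol-ia')
# Raises the RuntimeError added above until the download has finished.
for doc in dataset.docs_iter():
    print(doc.doc_id, doc.title)  # field names assumed from the AolIaDoc fixtures
    break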

ir_datasets/docs/bibliography.bib

Lines changed: 7 additions & 0 deletions

@@ -684,3 +684,10 @@ @article{Bonifacio2021MMarco
   year={2021},
   journal={arXiv:2108.13897}
 }
+
+@inproceedings{Pass2006Picture,
+  title={A picture of search},
+  author={Pass, Greg and Chowdhury, Abdur and Torgeson, Cayley},
+  booktitle={InfoScale},
+  year={2006}
+}

test/integration/aol_ia.py

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+import re
+import unittest
+import datetime
+import ir_datasets
+from ir_datasets.formats import GenericQuery, TrecQrel
+from ir_datasets.datasets.aol_ia import AolIaDoc, AolQlog, LogItem
+from .base import DatasetIntegrationTest
+
+
+class TestAolIa(DatasetIntegrationTest):
+    def test_docs(self):
+        self._test_docs('aol-ia', count=1525586, items={
+            0: AolIaDoc('00002a94464d', 'Alchimie Forever skin care for men and women', re.compile("^require \\( `` include/config\\.php '' \\) ; if \\( \\$ PHPSESSID \\) session_start \\( \\$ PHPSESSID \\) ; else sessi.{328} Terms and conditions © 2005 Alchimie Forever Sàrl \\. All rights reserved \\. Design : Agence Virtuelle$", flags=48), 'http://www.alchimie-forever.com', 'https://web.archive.org/web/20060218092031/http://www.alchimie-forever.com:80/'),
+            9: AolIaDoc('00007d6c3dd3', 'Pinehurst Tea Room & Caterering', re.compile('^We have had visitors \\. Welcome to Pinehurst Tea Room \\. This beautifully restored Victorian house is .{456}n please contact Lynda Dubbs at 770\\-474\\-7997 or feel free to email her at pinehursttearoom @ aol\\.com$', flags=48), 'http://www.pinehursttearoom.com', 'https://web.archive.org/web/20060209164740/http://www.pinehursttearoom.com:80/'),
+            1525585: AolIaDoc('fffff6b18440', 'Golf School - Arizona Golf School , Florida Golf School , Calfornia Golf School', '', 'http://lvgolfschools.com', 'https://web.archive.org/web/20060211025934/http://www.lvgolfschools.com:80/'),
+        })
+
+    def test_queries(self):
+        self._test_queries('aol-ia', count=9966939, items={
+            0: GenericQuery('8c418e7c9e5993', 'rentdirect com'),
+            9: GenericQuery('c8476c36af8761', 'www elaorg'),
+            9966938: GenericQuery('bba88dc56436eb', 'c21curabba'),
+        })
+
+    def test_qrels(self):
+        self._test_qrels('aol-ia', count=19442629, items={
+            0: TrecQrel('50aa67fe786ca7', '430d8aa747a3', 1, '142'),
+            9: TrecQrel('f6eff9e0848e2d', 'ecd6d884243b', 1, '217'),
+            19442628: TrecQrel('14c1b5b54212ad', 'a114f6d94af0', 1, '24967361'),
+        })
+
+    def test_qlog(self):
+        self._test_qlogs('aol-ia', count=36389567, items={
+            0: AolQlog('142', '8c418e7c9e5993', 'rentdirect com', 'rentdirect.com', datetime.datetime(2006, 3, 1, 7, 17, 12), ()),
+            6: AolQlog('142', '50aa67fe786ca7', 'westchester gov', 'westchester.gov', datetime.datetime(2006, 3, 20, 3, 55, 57), (LogItem('430d8aa747a3', '1', True),)),
+            9: AolQlog('142', 'b52c96bea30646', 'dfdf', 'dfdf', datetime.datetime(2006, 3, 24, 22, 23, 14), ()),
+            36389566: AolQlog('24969339', 'a03587795a216c', 'free credit report', 'free credit report', datetime.datetime(2006, 5, 31, 0, 42, 17), ()),
+        })
+
+
+if __name__ == '__main__':
+    unittest.main()
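
Note: a hedged sketch of consuming the new qlogs entity. Field names are assumed from the AolQlog fixtures above (user ID, query ID, normalized query, original query, time, clicked items) and from the TripClickQlog analogue removed in this commit; treat them as illustrative. Since the module ends in a unittest.main() guard, the test itself can presumably be run with `python -m unittest test.integration.aol_ia`.

import ir_datasets

dataset = ir_datasets.load('aol-ia')
for qlog in dataset.qlogs_iter():
    # Assumed field names; illustrative only.
    print(qlog.query_id, qlog.query, qlog.time, len(qlog.items))
    break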

test/integration/base.py

Lines changed: 17 additions & 5 deletions

@@ -90,19 +90,19 @@ def _test_qrels(self, dataset_name, count=None, items=None):

         self.assertEqual(0, len(items))

-    def _test_scoreddocs(self, dataset_name, count=None, items=None):
-        with self.subTest('scoreddocs', dataset=dataset_name):
+    def _test_qlogs(self, dataset_name, count=None, items=None):
+        with self.subTest('qlogs', dataset=dataset_name):
             if isinstance(dataset_name, str):
                 dataset = ir_datasets.load(dataset_name)
             else:
                 dataset = dataset_name
             expected_count = count
             items = items or {}
             count = 0
-            for i, scoreddoc in enumerate(_logger.pbar(dataset.scoreddocs_iter(), f'{dataset_name} scoreddocs', unit='scoreddoc')):
+            for i, qlogs in enumerate(_logger.pbar(dataset.qlogs_iter(), f'{dataset_name} qlogs', unit='qlog')):
                 count += 1
                 if i in items:
-                    self._assert_namedtuple(scoreddoc, items[i])
+                    self._assert_namedtuple(qlogs, items[i])
                     del items[i]
                 if expected_count is None and len(items) == 0:
                     break # no point in going further

@@ -211,7 +211,19 @@ def _build_test_docpairs(self, dataset_name):
             items[i] = docpair
         items[count-1] = docpair
         _logger.info(f'''
-self._test_docpairs(i{repr(dataset_name)}, count={count}, items={self._repr_namedtuples(items)})
+self._test_docpairs({repr(dataset_name)}, count={count}, items={self._repr_namedtuples(items)})
+''')
+
+    def _build_test_qlogs(self, dataset_name):
+        items = {}
+        count = 0
+        for i, qlog in enumerate(_logger.pbar(ir_datasets.load(dataset_name).qlogs_iter(), f'{dataset_name} qlogs', unit='qlogs')):
+            count += 1
+            if i in (0, 9):
+                items[i] = qlog
+            items[count-1] = qlog
+        _logger.info(f'''
+self._test_qlogs({repr(dataset_name)}, count={count}, items={self._repr_namedtuples(items)})
 ''')

     def _assert_namedtuple(self, a, b):
