You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
raiseRuntimeError('Download docs using download.py in aolia-tools')
123
+
raiseRuntimeError('''To use the documents of AOLIA, you will need to run the download script in https://github.com/terrierteam/aolia-tools. To run the script, use the following commands:
# NOTE: codecs.getreader is subtly broken here; it sometimes splits lines between special characters (and it's unclear why)
220
-
next(stream) # remove header
221
-
did, title, url=None, [], None
222
-
forlineinstream:
223
-
pbar.update(len(line))
224
-
line=line.decode().strip()
225
-
ifline==''orline=='(5196956 rows affected)':
226
-
continue
227
-
cols=line.split('\t')
228
-
ifdidisNone:
229
-
did=cols[0]
230
-
assertdid.isnumeric(), line
231
-
cols=cols[1:]
232
-
ifdidin ('9283014', '11088688', '11114797'): # a few special cases where the URL is actually missing. If we don't fix this here, we'll end up messing up subsequent records
9: AolIaDoc('00007d6c3dd3', 'Pinehurst Tea Room & Caterering', re.compile('^We have had visitors \\. Welcome to Pinehurst Tea Room \\. This beautifully restored Victorian house is .{456}n please contact Lynda Dubbs at 770\\-474\\-7997 or feel free to email her at pinehursttearoom @ aol\\.com$', flags=48), 'http://www.pinehursttearoom.com', 'https://web.archive.org/web/20060209164740/http://www.pinehursttearoom.com:80/'),
15
+
1525585: AolIaDoc('fffff6b18440', 'Golf School - Arizona Golf School , Florida Golf School , Calfornia Golf School', '', 'http://lvgolfschools.com', 'https://web.archive.org/web/20060211025934/http://www.lvgolfschools.com:80/'),
0 commit comments