13
13
import json
14
14
import logging
15
15
import sys
16
+ import utils
17
+ import time
16
18
17
19
import SPARQLWrapper
18
20
from splendid import chunker
@@ -118,6 +120,13 @@ def parse_args():
118
120
type = str ,
119
121
default = None ,
120
122
)
123
def _parse_bool_flag(value):
    """Convert a command line string into a real boolean.

    ``type=bool`` must not be used with argparse: it calls ``bool()`` on the
    raw string, and every non-empty string (including "False" and "0") is
    truthy, so the option could never be switched off explicitly.

    Raises ValueError for unrecognised values; argparse turns a ValueError
    raised by a ``type`` callable into a regular usage error.
    """
    normalized = value.strip().lower()
    if normalized in ("1", "true", "t", "yes", "y", "on"):
        return True
    # The empty string stays False, as it was under the old ``bool("")``.
    if normalized in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise ValueError("expected a boolean value, got %r" % (value,))

parser.add_argument(
    "--drop_bad_uris",
    help="URIs that cannot be curified are ignored",
    action="store",
    type=_parse_bool_flag,
    default=False,
)
121
130
parser .add_argument (
122
131
"--fusion_methods" ,
123
132
help = "Which fusion methods to use. During prediction, each of "
@@ -198,6 +207,7 @@ def main(
198
207
max_results ,
199
208
max_target_candidates_per_gp ,
200
209
batch_predict ,
210
+ drop_bad_uris ,
201
211
** _ # gulp remaining kwargs
202
212
):
203
213
from gp_query import calibrate_query_timeout
@@ -222,6 +232,8 @@ def main(
222
232
gps = cluster_gps_to_reduce_queries (
223
233
gps , max_queries , gtp_scores , clustering_variant )
224
234
235
+ processed = 0
236
+ start = time .time ()
225
237
batch_size = config .BATCH_SIZE if batch_predict else 1
226
238
# main loop
227
239
for lines in chunker (sys .stdin , batch_size ):
@@ -230,6 +242,13 @@ def main(
230
242
line = line .strip ()
231
243
if not line :
232
244
continue
245
+ if drop_bad_uris :
246
+ try :
247
+ source = from_n3 (line )
248
+ utils .curify (source )
249
+ except :
250
+ logger .warning ('Warning: Could not curify URI %s! Skip.' , line )
251
+ continue
233
252
if line [0 ] not in '<"' :
234
253
logger .error (
235
254
'expected inputs to start with < or ", but got: %s' , line )
@@ -238,7 +257,9 @@ def main(
238
257
batch .append (source )
239
258
batch = list (OrderedDict .fromkeys (batch ))
240
259
241
- if len (batch ) == 1 :
260
+ if len (batch ) == 0 :
261
+ pass
262
+ elif len (batch ) == 1 :
242
263
res = predict (
243
264
sparql , timeout , gps , batch [0 ], fusion_methods ,
244
265
max_results , max_target_candidates_per_gp
@@ -252,6 +273,8 @@ def main(
252
273
for r in res :
253
274
print (json .dumps (r ))
254
275
276
# Progress report: running total of handled inputs and the wall-clock time
# elapsed since `start` was taken.
processed = processed + len(batch)
elapsed = time.time() - start
logger.info('Have processed %d URIs now. Took %s sec', processed, elapsed)
255
278
256
279
if __name__ == "__main__" :
257
280
logger .info ('init run: origin' )
0 commit comments