
Commit 58dce32

predict command factored out of serve to run batch stdin->stdout prediction
1 parent 8486cb2 commit 58dce32

File tree

cluster.py
gp_learner.py
predict.py
serve.py

4 files changed: +246 -50 lines changed


cluster.py

Lines changed: 25 additions & 0 deletions
@@ -435,3 +435,28 @@ def select_best_variant(variant_max_k_prec_loss_reps, log_top_k=1):
         ])
     )
     return prec_loss, k, vn, reps
+
+
+def cluster_gps_to_reduce_queries(
+        gps, max_queries, gtp_scores, clustering_variant=None):
+    if 0 < max_queries < len(gps):
+        logger.info(
+            'reducing amount of queries from %d down to %d ...',
+            len(gps), max_queries
+        )
+        gtps = gtp_scores.ground_truth_pairs
+        var_max_k_prec_loss_reps = expected_precision_loss_by_query_reduction(
+            gps, gtps, [max_queries], gtp_scores,
+            variants=[clustering_variant] if clustering_variant else None,
+        )
+        prec_loss, k, vn, reps = select_best_variant(var_max_k_prec_loss_reps)
+
+        logger.info(
+            'reduced number of queries from %d to %d\n'
+            'used variant: %s\n'
+            'expected precision sum loss ratio: %0.3f '
+            '(precision sum loss: %.2f)',
+            len(gps), len(reps), vn, prec_loss, prec_loss * gtp_scores.score
+        )
+        gps = reps
+    return gps
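
For orientation, a minimal usage sketch of the new helper (hedged: the gps list and gtp_scores object are stand-ins, assumed to come from a loaded model as in predict.py below):

# sketch, not part of the commit: gps/gtp_scores assumed loaded from a model
from cluster import cluster_gps_to_reduce_queries

gps = cluster_gps_to_reduce_queries(
    gps, max_queries=100, gtp_scores=gtp_scores,
    clustering_variant=None,  # None: evaluate the variants, keep the best
)
# pass-through if max_queries is 0 (no limit) or len(gps) <= max_queries
assert len(gps) <= 100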

gp_learner.py

Lines changed: 9 additions & 26 deletions
@@ -41,7 +41,7 @@

 import logging_config
 from cluster import expected_precision_loss_by_query_reduction
-from cluster import select_best_variant
+from cluster import cluster_gps_to_reduce_queries
 import config
 from exception import GPLearnerAbortException
 from fusion import fuse_prediction_results
@@ -94,6 +94,10 @@
 signal.signal(signal.SIGUSR1, log_mem_usage)


+def init_workers():
+    parallel_map(_init_workers, range(1000))
+
+
 def _init_workers(_):
     # dummy method that makes workers load all import and config
     pass
@@ -1631,7 +1635,7 @@ def main(
     print(u'encoding check: äöüß\U0001F385')  # printing unicode string

     # init workers
-    parallel_map(_init_workers, range(1000))
+    init_workers()

     timer_start = datetime.utcnow()
     main_start = timer_start
@@ -1738,30 +1742,9 @@ def main(
     sys.stdout.flush()
     sys.stderr.flush()

-
-    if 0 < max_queries < len(gps):
-        print(
-            'reducing amount of queries from %d down to %d ...' % (
-                len(gps), max_queries)
-        )
-        sys.stdout.flush()
-        var_max_k_prec_loss_reps = expected_precision_loss_by_query_reduction(
-            gps, semantic_associations, [max_queries], gtp_scores,
-            variants=[clustering_variant] if clustering_variant else None,
-        )
-        prec_loss, k, vn, reps = select_best_variant(var_max_k_prec_loss_reps)
-        sys.stderr.flush()
-        print('reduced number of queries from %d to %d' % (len(gps), len(reps)))
-        print('used variant: %s' % vn)
-        print(
-            'expected precision sum loss ratio: %0.3f '
-            '(precision sum loss: %.2f)' % (
-                prec_loss, prec_loss * gtp_scores.score)
-        )
-        gps = reps
-
-    sys.stdout.flush()
-    sys.stderr.flush()
+    # reduce gps by clustering if mandated by max_queries
+    gps = cluster_gps_to_reduce_queries(
+        gps, max_queries, gtp_scores, clustering_variant)

     if print_query_patterns:
         print(
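
Aside on init_workers above: assuming parallel_map wraps something like scoop.futures.map (predict.py's comments mention scoop), mapping a no-op over many dummy values forces each worker process to import the module and load config before real tasks arrive. A rough sketch of that warm-up pattern, not the repo's actual wrapper:

# warm-up sketch; assumes workers started via `python -m scoop`
from scoop import futures

def _noop(_):
    pass  # the side effect is the module import on each worker

def warm_up():
    # futures.map is lazy; consume it so all dummy tasks really dispatch
    list(futures.map(_noop, range(1000)))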

predict.py

Lines changed: 208 additions & 0 deletions
@@ -0,0 +1,208 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+"""Script to predict with a fully trained model.
+
+Reads one source (TTL syntax) per line from stdin and writes one JSON line to
+stdout.
+"""
+
+import json
+import logging
+import sys
+
+import SPARQLWrapper
+from rdflib.util import from_n3
+
+
+# noinspection PyUnresolvedReferences
+import logging_config
+
+# not all imports at top due to scoop and init...
+
+logger = logging.getLogger(__name__)
+
+
+def predict(sparql, timeout, gps, source,
+            fusion_methods=None, max_results=0, max_target_candidates_per_gp=0):
+    from fusion import fuse_prediction_results
+    from gp_learner import predict_target_candidates
+
+    gp_tcs = predict_target_candidates(sparql, timeout, gps, source)
+    fused_results = fuse_prediction_results(
+        gps,
+        gp_tcs,
+        fusion_methods
+    )
+    orig_length = max([len(v) for k, v in fused_results.items()])
+    if max_results > 0:
+        for k, v in fused_results.items():
+            del v[max_results:]
+    mt = max_target_candidates_per_gp
+    if mt < 1:
+        mt = None
+    # logger.info(gp_tcs)
+    res = {
+        'source': source,
+        'orig_result_length': orig_length,
+        'graph_pattern_target_candidates': [sorted(tcs)[:mt] for tcs in gp_tcs],
+        'fused_results': fused_results,
+    }
+    return res
+
+
+def parse_args():
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description='gp learner prediction',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    parser.add_argument(
+        "--sparql_endpoint",
+        help="the SPARQL endpoint to query",
+        action="store",
+        default=config.SPARQL_ENDPOINT,
+    )
+    parser.add_argument(
+        "--max_queries",
+        help="limits the amount of queries per prediction (0: no limit). "
+             "You want to use the same limit as in training for late fusion "
+             "models.",
+        action="store",
+        type=int,
+        default=100,
+    )
+    parser.add_argument(
+        "--clustering_variant",
+        help="if specified use this clustering variant for query reduction, "
+             "otherwise select the best from various.",
+        action="store",
+        type=str,
+        default=None,
+    )
+    parser.add_argument(
+        "--fusion_methods",
+        help="Which fusion methods to use. During prediction, each of "
+             "the learned patterns can generate a list of target candidates. "
+             "Fusion re-combines these into a single ranked list of "
+             "predicted targets. By default this will use all "
+             "implemented fusion methods. Any of them, or a ',' delimited list "
+             "can be used to reduce the output (just make sure you ran "
+             "--predict=train_set on them before). Also supports 'basic' and "
+             "'classifier' as shorthands. Make sure to only select methods the "
+             "selected model was also trained on!",
+        action="store",
+        type=str,
+        default=None,
+    )
+
+    parser.add_argument(
+        "--timeout",
+        help="sets the timeout in seconds for each query (0: auto calibrate)",
+        action="store",
+        type=float,
+        default=2.,
+    )
+    parser.add_argument(
+        "--max_results",
+        help="limits the result list lengths to save bandwidth (0: no limit)",
+        action="store",
+        type=int,
+        default=100,
+    )
+    parser.add_argument(
+        "--max_target_candidates_per_gp",
+        help="limits the target candidate list lengths to save bandwidth "
+             "(0: no limit)",
+        action="store",
+        type=int,
+        default=100,
+    )
+
+    parser.add_argument(
+        "resdir",
+        help="result directory of the trained model (overrides --RESDIR)",
+        action="store",
+    )
+
+    cfg_group = parser.add_argument_group(
+        'Advanced config overrides',
+        'The following allow overriding default values from config/defaults.py'
+    )
+    config.arg_parse_config_vars(cfg_group)
+
+    prog_args = vars(parser.parse_args())
+    # the following were aliased above, make sure they're updated globally
+    prog_args.update({
+        'SPARQL_ENDPOINT': prog_args['sparql_endpoint'],
+        'RESDIR': prog_args['resdir'],
+    })
+    config.finalize(prog_args)
+
+    return prog_args
+
+
+def main(
+        resdir,
+        sparql_endpoint,
+        max_queries,
+        clustering_variant,
+        fusion_methods,
+        timeout,
+        max_results,
+        max_target_candidates_per_gp,
+        **_  # gulp remaining kwargs
+):
+    from gp_query import calibrate_query_timeout
+    from serialization import load_results
+    from serialization import find_last_result
+    from cluster import cluster_gps_to_reduce_queries
+    from gp_learner import init_workers
+
+    # init workers
+    init_workers()
+
+    sparql = SPARQLWrapper.SPARQLWrapper(sparql_endpoint)
+    timeout = timeout if timeout > 0 else calibrate_query_timeout(sparql)
+
+    # load model
+    last_res = find_last_result()
+    if not last_res:
+        logger.error('cannot find fully trained model in %s', resdir)
+        sys.exit(1)
+    result_patterns, coverage_counts, gtp_scores = load_results(last_res)
+    gps = [gp for gp, _ in result_patterns]
+    gps = cluster_gps_to_reduce_queries(
+        gps, max_queries, gtp_scores, clustering_variant)
+
+    # main loop
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+        if line[0] not in '<"':
+            logger.error(
+                'expected inputs to start with < or ", but got: %s', line)
+            sys.exit(1)
+        source = from_n3(line)
+
+        res = predict(
+            sparql, timeout, gps, source, fusion_methods,
+            max_results, max_target_candidates_per_gp
+        )
+        print(json.dumps(res))
+
+
+if __name__ == "__main__":
+    logger.info('init run: origin')
+    import config
+    prog_kwds = parse_args()
+    main(**prog_kwds)
+else:
+    logger.info('init run: worker')
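
Since the new script speaks a plain line protocol (one N3 term in, one JSON object out), it is easy to drive in batch from another process. A hypothetical driver sketch follows; the results directory name and the two IRIs are placeholders, and depending on the scoop setup the script may need to be launched via `python -m scoop predict.py` instead:

# hypothetical batch driver for predict.py; paths and IRIs are placeholders
import json
import subprocess

sources = [
    '<http://dbpedia.org/resource/Horse>',
    '<http://dbpedia.org/resource/Saddle>',
]
proc = subprocess.Popen(
    ['python', 'predict.py', 'results'],  # resdir is the positional arg
    stdin=subprocess.PIPE, stdout=subprocess.PIPE,
)
out, _ = proc.communicate('\n'.join(sources).encode('utf-8'))
for line in out.splitlines():
    res = json.loads(line)
    # keys as emitted by predict(): source, orig_result_length,
    # graph_pattern_target_candidates, fused_results
    print(res['source'], res['orig_result_length'])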

serve.py

Lines changed: 4 additions & 24 deletions
@@ -81,32 +81,12 @@ def predict():


 def _predict(source):
-    from fusion import fuse_prediction_results
-    from gp_learner import predict_target_candidates
     from gp_query import calibrate_query_timeout
-
+    from predict import predict
     timeout = TIMEOUT if TIMEOUT > 0 else calibrate_query_timeout(SPARQL)
-    gp_tcs = predict_target_candidates(SPARQL, timeout, GPS, source)
-    fused_results = fuse_prediction_results(
-        GPS,
-        gp_tcs,
-        FUSION_METHODS
-    )
-    orig_length = max([len(v) for k, v in fused_results.items()])
-    if MAX_RESULTS > 0:
-        for k, v in fused_results.items():
-            del v[MAX_RESULTS:]
-    mt = MAX_TARGET_CANDIDATES_PER_GP
-    if mt < 1:
-        mt = None
-    # logger.info(gp_tcs)
-    res = {
-        'source': source,
-        'orig_result_length': orig_length,
-        'graph_pattern_target_candidates': [sorted(tcs)[:mt] for tcs in gp_tcs],
-        'fused_results': fused_results,
-    }
-    return res
+    return predict(
+        SPARQL, timeout, GPS, source,
+        FUSION_METHODS, MAX_RESULTS, MAX_TARGET_CANDIDATES_PER_GP)


 @app.route("/api/feedback", methods=["POST"])
