|
| 1 | +import logging |
| 2 | + |
| 3 | +import numpy as np |
| 4 | +import pandas as pd |
| 5 | +from asreviewcontrib.preprocess.base import BasePipeline |
| 6 | + |
| 7 | + |
class Filtr:
    """Class for creating filters for record pairs using similarity features.

    A filter is one of:

    * standalone -- a single (feature, threshold) pair registered with the
      ``add`` method;
    * combined -- several (feature, threshold) pairs registered with the
      ``add_multiple`` method, whose individual results are combined using
      either the "and" or the "or" operator.

    Attributes
    ----------
    filters : list
        The registered (feature, threshold) tuples.
    filter_type : str or None
        ``None`` while the filter is empty, otherwise "standalone", "and"
        or "or".
    """

    # Operators accepted by add_multiple (compared case-insensitively).
    _COMBINED_TYPES = ("and", "or")

    def __init__(self):
        self.filters = []
        self.filter_type = None

    def add(self, feature, threshold):
        """Add (feature,threshold) to a standalone filter.

        Raises
        ------
        ValueError
            If the filter already holds a combined ("and"/"or") filter, or
            already holds its single standalone pair. Silently accepting
            either would relabel or extend the filter in a way that a
            standalone filter (one feature, one threshold) cannot express.
        """
        if self.filter_type in self._COMBINED_TYPES:
            raise ValueError(
                f"You cannot add a standalone filter to a '{self.filter_type}' filter"
            )
        if self.filters:
            raise ValueError(
                "A standalone filter can only hold a single (feature, threshold) pair"
            )

        self.filters.append((feature, threshold))
        self.filter_type = "standalone"

    def add_multiple(self, filter_list, filter_type):
        """Add several (feature, threshold) pairs to a combined filter.

        Parameters
        ----------
        filter_list : iterable
            (feature, threshold) tuples to append to ``filters``.
        filter_type : str
            How the individual results are combined: "and" or "or"
            (case-insensitive; stored in lower case).

        Raises
        ------
        ValueError
            If this is a standalone filter, if ``filter_type`` is not
            "and"/"or", or if it differs from the type already set.
        """
        if self.filter_type == "standalone":
            raise ValueError("You cannot add multiple filters to a standalone filter")

        # Reject unknown operators here rather than letting them fail (or be
        # silently skipped) when the filter is eventually applied.
        if (
            not isinstance(filter_type, str)
            or filter_type.lower() not in self._COMBINED_TYPES
        ):
            raise ValueError(
                f"Unknown filter type {filter_type!r}; expected 'and' or 'or'"
            )
        filter_type = filter_type.lower()

        if self.filter_type is not None and filter_type != self.filter_type:
            raise ValueError(
                f"You cannot add a '{filter_type}' filter to a '{self.filter_type}' filter"
            )

        self.filter_type = filter_type
        self.filters.extend(filter_list)
| 39 | + |
| 40 | + |
class FilterPipeline(BasePipeline):
    """Class for creating filter pipeline for record pairs using similarity features.

    Filters are registered under a name with ``add`` and evaluated together
    by ``apply_pipe``, which keeps every pair accepted by at least one of
    them.
    """

    def __init__(self):
        super().__init__()

    def add(self, name: str, filtr: Filtr) -> None:
        """Add a filter to the pipeline under the given name."""
        self._pipeline.append((name, filtr))

    def apply_pipe(self, pairs_df: pd.DataFrame) -> pd.DataFrame:
        """Apply filters in the pipeline and combine their results by union.

        Parameters
        ----------
        pairs_df : pd.DataFrame
            Pairs dataframe created by indexing (blocking) with similarity features

        Returns
        -------
        pd.DataFrame
            The rows of ``pairs_df`` accepted by at least one filter, in
            their original order and with their original index.

        Raises
        ------
        ValueError
            If the pipeline is empty, or a filter has a type other than
            "standalone", "and" or "or".
        """
        if not self._pipeline:
            raise ValueError(
                "You need to add filters to the pipeline using 'add' method "
                "before using apply_pipe"
            )

        keep = np.zeros(pairs_df.shape[0], dtype=bool)
        for name, filtr in self._pipeline:
            logging.info(
                "Applying %s filter of type %s...", name, filtr.filter_type
            )
            keep |= self._filter_mask(pairs_df, name, filtr)

        # Select with the boolean mask itself: it is positional, so it stays
        # correct for any index, whereas feeding row positions to the
        # label-based .loc is only right for a default 0..n-1 index.
        return pairs_df[keep]

    @staticmethod
    def _filter_mask(pairs_df, name, filtr):
        """Return the boolean row mask of ``pairs_df`` accepted by one filter."""
        raw_type = filtr.filter_type
        kind = raw_type.lower() if isinstance(raw_type, str) else raw_type
        n_rows = pairs_df.shape[0]

        if kind == "standalone":
            if len(filtr.filters) > 1:
                logging.warning(
                    "Standalone filter %s holds %d entries; only the first is applied",
                    name,
                    len(filtr.filters),
                )
            feature, threshold = filtr.filters[0]
            return (pairs_df[feature] >= threshold).values

        if kind == "and":
            # Start from "everything passes" and narrow with each condition.
            mask = np.ones(n_rows, dtype=bool)
            for feature, threshold in filtr.filters:
                mask &= (pairs_df[feature] >= threshold).values
            return mask

        if kind == "or":
            # Start from "nothing passes" and widen with each condition.
            mask = np.zeros(n_rows, dtype=bool)
            for feature, threshold in filtr.filters:
                mask |= (pairs_df[feature] >= threshold).values
            return mask

        # An empty or mistyped filter used to leave the per-filter result
        # unbound (or stale from the previous filter); fail explicitly.
        raise ValueError(
            f"Filter '{name}' has unsupported type {raw_type!r}; "
            "expected 'standalone', 'and' or 'or'"
        )
0 commit comments