Skip to content

Commit 82bafab

Browse files
committed
Add filter pipeline functionality
1 parent dc45a80 commit 82bafab

File tree

2 files changed

+85
-1
lines changed

2 files changed

+85
-1
lines changed
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import logging
2+
3+
import numpy as np
4+
import pandas as pd
5+
from asreviewcontrib.preprocess.base import BasePipeline
6+
7+
8+
class Filtr:
    """Class for creating filters for record pairs using similarity features.

    A filter is a list of ``(feature, threshold)`` conditions plus a
    ``filter_type`` telling how those conditions are to be interpreted:

    * ``"standalone"`` -- a filter applied to a single feature, created with
      the :meth:`add` method giving ``feature`` and ``threshold``.
    * ``"and"`` / ``"or"`` -- a combined filter, created with
      :meth:`add_multiple`, whose individual feature conditions are combined
      with the corresponding logical operator.

    Attributes
    ----------
    filters : list
        The ``(feature, threshold)`` tuples added so far.
    filter_type : str or None
        ``None`` until a condition is added, afterwards one of
        ``"standalone"``, ``"and"`` or ``"or"``.
    """

    # Filter types that combine several (feature, threshold) conditions.
    _COMBINED_TYPES = ("and", "or")

    def __init__(self):
        self.filters = []
        self.filter_type = None

    def add(self, feature, threshold):
        """Add (feature, threshold) to a standalone filter.

        Parameters
        ----------
        feature
            Name of the similarity feature the condition applies to.
        threshold
            Threshold value paired with ``feature``.

        Raises
        ------
        ValueError
            If the filter was already configured as a combined ("and"/"or")
            filter; silently relabelling it as standalone would corrupt it.
        """
        if self.filter_type in self._COMBINED_TYPES:
            raise ValueError(
                f"You cannot add a standalone filter to a '{self.filter_type}' filter"
            )

        self.filters.append((feature, threshold))
        self.filter_type = "standalone"

    def add_multiple(self, filter_list, filter_type):
        """Add several (feature, threshold) conditions to a combined filter.

        Parameters
        ----------
        filter_list : iterable of tuple
            ``(feature, threshold)`` pairs to append to the filter.
        filter_type : str
            ``"and"`` or ``"or"`` (case-insensitive); stored in lowercase.

        Raises
        ------
        ValueError
            If the filter is a standalone filter, if ``filter_type`` is not
            "and"/"or", or if it differs from the type already set.
        """
        if self.filter_type == "standalone":
            raise ValueError("You cannot add multiple filters to a standalone filter")

        # Normalise the case so that "AND"/"OR" are stored as "and"/"or".
        normalized = filter_type.lower() if isinstance(filter_type, str) else filter_type
        if normalized not in self._COMBINED_TYPES:
            raise ValueError(
                f"Unknown filter type '{filter_type}', expected 'and' or 'or'"
            )

        if self.filter_type is not None and normalized != self.filter_type:
            raise ValueError(
                f"You cannot add a '{normalized}' filter to a '{self.filter_type}' filter"
            )

        self.filter_type = normalized
        self.filters.extend(filter_list)
39+
40+
41+
class FilterPipeline(BasePipeline):
    """Class for creating filter pipeline for record pairs using similarity features"""

    def __init__(self):
        super().__init__()

    def add(self, name: str, filtr: Filtr) -> None:
        """Add a filter to the pipeline

        Parameters
        ----------
        name : str
            Label for the filter; used in the log message when it is applied.
        filtr : Filtr
            The filter to append to the pipeline.
        """
        self._pipeline.append((name, filtr))

    def apply_pipe(self, pairs_df: pd.DataFrame) -> pd.DataFrame:
        """Apply filters in the pipeline and combine their results by union

        A row is kept when at least one filter in the pipeline accepts it.

        Parameters
        ----------
        pairs_df : pd.DataFrame
            Pairs dataframe created by indexing (blocking) with similarity features

        Returns
        -------
        pd.DataFrame
            The rows of ``pairs_df`` accepted by at least one filter, in
            their original order and with their original index.

        Raises
        ------
        ValueError
            If the pipeline is empty, or if a filter has a type other than
            "standalone", "and" or "or".
        """
        if len(self._pipeline) == 0:
            raise ValueError(
                "You need to add filters to the pipeline using 'add' method "
                "before using apply_pipe"
            )

        keep = np.zeros(pairs_df.shape[0], dtype=bool)
        for name, filtr in self._pipeline:
            logging.info(
                "Applying %s filter of type %s...", name, filtr.filter_type
            )
            keep |= self._filter_mask(pairs_df, filtr)

        # Select with a boolean mask rather than positional integers passed
        # to the label-based ``.loc``: the latter is only correct when the
        # dataframe has a default RangeIndex.
        return pairs_df.loc[keep]

    @staticmethod
    def _filter_mask(pairs_df, filtr):
        """Return a boolean array marking the rows accepted by one filter."""
        kind = filtr.filter_type
        if isinstance(kind, str):
            kind = kind.lower()

        if kind == "standalone":
            # A standalone filter is evaluated on its first condition only.
            feature, threshold = filtr.filters[0]
            return np.asarray((pairs_df[feature] >= threshold).values, dtype=bool)

        if kind in ("and", "or"):
            combine = np.logical_and if kind == "and" else np.logical_or
            # Start from the neutral element: all True for "and", all False for "or".
            mask = np.full(pairs_df.shape[0], kind == "and", dtype=bool)
            for feature, threshold in filtr.filters:
                mask = combine(mask, (pairs_df[feature] >= threshold).values)
            return mask

        raise ValueError(
            f"Unknown filter type '{filtr.filter_type}', "
            "expected 'standalone', 'and' or 'or'"
        )

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
],
5757
"asreview.preprocess.deduplicators": [
5858
"asr = asreviewcontrib.preprocess.deduplication.methods.asr:ASRDedup",
59-
"asr = asreviewcontrib.preprocess.deduplication.methods.endnote_default:ENDefaultDedup",
59+
"endnote = asreviewcontrib.preprocess.deduplication.methods.endnote_default:ENDefaultDedup",
6060
],
6161
},
6262
project_urls={

0 commit comments

Comments
 (0)