88
99
1010class CumulativeSimSplitter (BaseSplitter ):
11-
1211 """
13- Called "cumulative sim" because we check the similarities of the embeddings of cumulative concatenated documents with the next document.
12+ Called "cumulative sim" because we check the similarities of the
13+ embeddings of cumulative concatenated documents with the next document.
1414 """
1515
1616 def __init__ (
@@ -19,26 +19,30 @@ def __init__(
1919 name : str = "cumulative_similarity_splitter" ,
2020 score_threshold : float = 0.45 ,
2121 ):
22- super ().__init__ (name = name , score_threshold = score_threshold , encoder = encoder )
22+ super ().__init__ (name = name , encoder = encoder )
2323 encoder .score_threshold = score_threshold
24+ self .score_threshold = score_threshold
2425
2526 def __call__ (self , docs : List [str ]):
2627 total_docs = len (docs )
2728 # Check if there's only a single document
2829 if total_docs == 1 :
2930 raise ValueError (
30- "There is only one document provided; at least two are required to determine topics based on similarity."
31+ "There is only one document provided; at least two are required "
32+ "to determine topics based on similarity."
3133 )
3234 splits = []
3335 curr_split_start_idx = 0
3436
3537 for idx in range (0 , total_docs ):
3638 if idx + 1 < total_docs : # Ensure there is a next document to compare with.
3739 if idx == 0 :
38- # On the first iteration, compare the first document directly to the second.
40+ # On the first iteration, compare the
41+ # first document directly to the second.
3942 curr_split_docs = docs [idx ]
4043 else :
41- # For subsequent iterations, compare cumulative documents up to the current one with the next.
44+ # For subsequent iterations, compare cumulative
45+ # documents up to the current one with the next.
4246 curr_split_docs = "\n " .join (docs [curr_split_start_idx : idx + 1 ])
4347 next_doc = docs [idx + 1 ]
4448
0 commit comments