Commit 77f16df

fix: typing issues, bug in inference (MinishLab#224)
1 parent 39f02f6 commit 77f16df

2 files changed: 42 additions & 13 deletions


model2vec/inference/model.py

Lines changed: 8 additions & 7 deletions

```diff
@@ -3,7 +3,7 @@
 import re
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import TypeVar
+from typing import Sequence, TypeVar
 
 import huggingface_hub
 import numpy as np
@@ -65,11 +65,12 @@ def save_pretrained(self, path: str) -> None:
         """Save the model to a folder."""
         save_pipeline(self, path)
 
-    def push_to_hub(self, repo_id: str, token: str | None = None, private: bool = False) -> None:
+    def push_to_hub(self, repo_id: str, subfolder: str, token: str | None = None, private: bool = False) -> None:
         """
         Save a model to a folder, and then push that folder to the hf hub.
 
         :param repo_id: The id of the repository to push to.
+        :param subfolder: The subfolder to push to.
         :param token: The token to use to push to the hub.
         :param private: Whether the repository should be private.
         """
@@ -78,11 +79,11 @@ def push_to_hub(self, repo_id: str, token: str | None = None, private: bool = Fa
         with TemporaryDirectory() as temp_dir:
             save_pipeline(self, temp_dir)
             self.model.save_pretrained(temp_dir)
-            push_folder_to_hub(Path(temp_dir), repo_id, private, token)
+            push_folder_to_hub(Path(temp_dir), subfolder, repo_id, private, token)
 
     def _encode_and_coerce_to_2d(
         self,
-        X: list[str] | str,
+        X: Sequence[str],
         show_progress_bar: bool,
         max_length: int | None,
         batch_size: int,
```
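The inference bug named in the commit title is the `push_folder_to_hub` call above, which had fallen out of sync with that helper's signature and omitted the `subfolder` argument; `push_to_hub` now accepts `subfolder` and forwards it. A minimal usage sketch, assuming `StaticModelPipeline` is the pipeline class defined in this file; the repo id, subfolder name, and token are placeholders:

```python
from model2vec.inference import StaticModelPipeline  # assumed import path

pipeline = StaticModelPipeline.from_pretrained("my-org/my-classifier")  # placeholder repo

pipeline.push_to_hub(
    repo_id="my-org/my-classifier",  # placeholder repo id
    subfolder="pipeline",            # hypothetical subfolder within the repo
    token="hf_...",                  # placeholder HF token
    private=False,
)
```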
```diff
@@ -105,7 +106,7 @@ def _encode_and_coerce_to_2d(
 
     def predict(
         self,
-        X: list[str] | str,
+        X: Sequence[str],
         show_progress_bar: bool = False,
         max_length: int | None = 512,
         batch_size: int = 1024,
@@ -145,7 +146,7 @@ def predict(
 
     def predict_proba(
         self,
-        X: list[str] | str,
+        X: Sequence[str],
         show_progress_bar: bool = False,
         max_length: int | None = 512,
         batch_size: int = 1024,
@@ -175,7 +176,7 @@ def predict_proba(
         return self.head.predict_proba(encoded)
 
     def evaluate(
-        self, X: list[str], y: LabelType, batch_size: int = 1024, threshold: float = 0.5, output_dict: bool = False
+        self, X: Sequence[str], y: LabelType, batch_size: int = 1024, threshold: float = 0.5, output_dict: bool = False
     ) -> str | dict[str, dict[str, float]]:
         """
         Evaluate the classifier on a given dataset using scikit-learn's classification report.
```
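The `list[str] | str` to `Sequence[str]` change in `_encode_and_coerce_to_2d`, `predict`, `predict_proba`, and `evaluate` lets type checkers accept any string sequence, not just lists. A short sketch, reusing the hypothetical `pipeline` from the previous example:

```python
texts = ("great product", "terrible service")  # a tuple now satisfies Sequence[str]

labels = pipeline.predict(texts)        # array of predicted labels
probas = pipeline.predict_proba(texts)  # array of class probabilities

# evaluate compares predictions against gold labels; single-label string
# classes are assumed here for illustration.
print(pipeline.evaluate(texts, ["pos", "neg"]))
```

One caveat of the looser type: a bare `str` is itself a `Sequence[str]`, which is the underspecification the docstring NOTEs added in model2vec/model.py below call out.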

model2vec/model.py

Lines changed: 34 additions & 6 deletions

```diff
@@ -5,7 +5,7 @@
 from logging import getLogger
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Any, Iterator, Union
+from typing import Any, Iterator, Sequence, Union, overload
 
 import numpy as np
 from joblib import delayed
@@ -117,7 +117,7 @@ def save_pretrained(self, path: PathLike, model_name: str | None = None, subfold
             subfolder=subfolder,
         )
 
-    def tokenize(self, sentences: list[str], max_length: int | None = None) -> list[list[int]]:
+    def tokenize(self, sentences: Sequence[str], max_length: int | None = None) -> list[list[int]]:
         """
         Tokenize a list of sentences.
 
@@ -245,9 +245,31 @@ def from_sentence_transformers(
             language=metadata.get("language"),
         )
 
+    @overload
     def encode_as_sequence(
         self,
-        sentences: list[str] | str,
+        sentences: str,
+        max_length: int | None = None,
+        batch_size: int = 1024,
+        show_progress_bar: bool = False,
+        use_multiprocessing: bool = True,
+        multiprocessing_threshold: int = 10_000,
+    ) -> np.ndarray: ...
+
+    @overload
+    def encode_as_sequence(
+        self,
+        sentences: list[str],
+        max_length: int | None = None,
+        batch_size: int = 1024,
+        show_progress_bar: bool = False,
+        use_multiprocessing: bool = True,
+        multiprocessing_threshold: int = 10_000,
+    ) -> list[np.ndarray]: ...
+
+    def encode_as_sequence(
+        self,
+        sentences: str | list[str],
         max_length: int | None = None,
         batch_size: int = 1024,
         show_progress_bar: bool = False,
@@ -263,6 +285,9 @@ def encode_as_sequence(
         This is about twice as slow.
         Sentences that do not contain any tokens will be turned into an empty array.
 
+        NOTE: the input type is currently underspecified. The actual input type is `Sequence[str] | str`, but this
+        is not possible to implement in python typing currently.
+
         :param sentences: The list of sentences to encode.
         :param max_length: The maximum length of the sentences. Any tokens beyond this length will be truncated.
             If this is None, no truncation is done.
```
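The `@overload` pair above gives `encode_as_sequence` a precise per-input return type even though a single runtime implementation handles both cases. A minimal sketch of how the overloads resolve, assuming `StaticModel` is the class being modified here; the model id is a placeholder:

```python
import numpy as np

from model2vec import StaticModel  # assumed import path for the class in this file

model = StaticModel.from_pretrained("minishlab/potion-base-8M")  # placeholder model id

# First overload: a single str resolves to one np.ndarray of token embeddings.
single: np.ndarray = model.encode_as_sequence("hello world")

# Second overload: list[str] resolves to a list with one array per sentence.
many: list[np.ndarray] = model.encode_as_sequence(["hello there", "general kenobi"])
```

Overloading on `Sequence[str]` instead of `list[str]` would be ambiguous, since `str` is itself a `Sequence[str]` and would match both branches; that is the limitation the NOTE in the docstring refers to.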
```diff
@@ -320,7 +345,7 @@ def _encode_batch_as_sequence(self, sentences: list[str], max_length: int | None
 
     def encode(
         self,
-        sentences: list[str] | str,
+        sentences: Sequence[str],
         show_progress_bar: bool = False,
         max_length: int | None = 512,
         batch_size: int = 1024,
@@ -334,6 +359,9 @@ def encode(
         This function encodes a list of sentences by averaging the word embeddings of the tokens in the sentence.
         For ease of use, we don't batch sentences together.
 
+        NOTE: the return type is currently underspecified. In the case of a single string, this returns a 1D array,
+        but in the case of a list of strings, this returns a 2D array. Not possible to implement in numpy currently.
+
         :param sentences: The list of sentences to encode. You can also pass a single sentence.
         :param show_progress_bar: Whether to show the progress bar.
         :param max_length: The maximum length of the sentences. Any tokens beyond this length will be truncated.
@@ -378,7 +406,7 @@ def encode(
             return out_array[0]
         return out_array
 
-    def _encode_batch(self, sentences: list[str], max_length: int | None) -> np.ndarray:
+    def _encode_batch(self, sentences: Sequence[str], max_length: int | None) -> np.ndarray:
         """Encode a batch of sentences."""
         ids = self.tokenize(sentences=sentences, max_length=max_length)
         out: list[np.ndarray] = []
@@ -396,7 +424,7 @@ def _encode_batch(self, sentences: list[str], max_length: int | None) -> np.ndar
         return out_array
 
     @staticmethod
-    def _batch(sentences: list[str], batch_size: int) -> Iterator[list[str]]:
+    def _batch(sentences: Sequence[str], batch_size: int) -> Iterator[Sequence[str]]:
         """Batch the sentences into equal-sized."""
         return (sentences[i : i + batch_size] for i in range(0, len(sentences), batch_size))
 
```
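`_batch` is unchanged apart from its annotations; it lazily yields consecutive `batch_size`-sized slices of the input sequence. A standalone sketch of the same contract (a module-level `batch` is used here so it runs outside the class):

```python
from typing import Iterator, Sequence


def batch(sentences: Sequence[str], batch_size: int) -> Iterator[Sequence[str]]:
    """Yield consecutive slices of at most batch_size sentences each."""
    return (sentences[i : i + batch_size] for i in range(0, len(sentences), batch_size))


print([list(b) for b in batch(("a", "b", "c", "d", "e"), 2)])
# -> [['a', 'b'], ['c', 'd'], ['e']]
```

Slicing preserves the input's concrete type, so tuples come back as tuples. A bare `str` would also be accepted and sliced into substrings, the same `str`-is-a-`Sequence[str]` caveat the docstring NOTEs describe.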
402430

0 commit comments