Merge pull request stanfordnlp#1211 from Anindyadeep/premai/vectorizer

krypticmouse · web-flow · commit 99b4cb288504 · 2024-06-27T15:47:35.000+05:30
Prem AI Vectorizer support
diff --git a/dsp/modules/sentence_vectorizer.py b/dsp/modules/sentence_vectorizer.py
@@ -6,14 +6,15 @@
 
 
 class BaseSentenceVectorizer(abc.ABC):
-    '''
+    """
     Base Class for Vectorizers. The main purpose is to vectorize text (doc/query)
     for ANN/KNN indexes. `__call__` method takes `List[Example]` as a single input, then extracts
     `field_to_vectorize` from every Example and convert them into embeddings.
     You can customize extraction logic in the `_extract_text_from_examples` method.
-    '''
+    """
+
     # embeddings will be computed based on the string in this attribute of Example object
-    field_to_vectorize = 'text_to_vectorize'
+    field_to_vectorize = "text_to_vectorize"
 
     def __init__(self) -> None:
         pass
@@ -24,28 +25,29 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
 
     def _extract_text_from_examples(self, inp_examples: List) -> List[str]:
         if isinstance(inp_examples[0], str):
-            return inp_examples 
+            return inp_examples
         return [" ".join([example[key] for key in example._input_keys]) for example in inp_examples]
 
 
 class SentenceTransformersVectorizer(BaseSentenceVectorizer):
-    '''
+    """
     Vectorizer based on `SentenceTransformers` models. You can pick any model from this link:
     https://huggingface.co/models?library=sentence-transformers
     More details about models:
     https://www.sbert.net/docs/pretrained_models.html
-    '''
+    """
+
     def __init__(
         self,
-        model_name_or_path: str = 'all-MiniLM-L6-v2',
+        model_name_or_path: str = "all-MiniLM-L6-v2",
         vectorize_bs: int = 256,
         max_gpu_devices: int = 1,
         normalize_embeddings: bool = False,
     ):
         # this isn't a good practice, but with top-level import the whole DSP
         # module import will be slow (>5 sec), because SentenceTransformer is doing
         # it's directory/file-related magic under the hood :(
-        
+
         try:
             from sentence_transformers import SentenceTransformer
         except ImportError:
@@ -55,9 +57,9 @@ def __init__(
                 "or simply run `pip install sentence-transformers",
             )
         from dsp.utils.ann_utils import determine_devices
-        
+
         self.num_devices, self.is_gpu = determine_devices(max_gpu_devices)
-        self.proxy_device = 'cuda' if self.is_gpu else 'cpu'
+        self.proxy_device = "cuda" if self.is_gpu else "cpu"
 
         self.model = SentenceTransformer(model_name_or_path, device=self.proxy_device)
 
@@ -93,42 +95,42 @@ def __call__(self, inp_examples: List) -> np.ndarray:
 
 
 class NaiveGetFieldVectorizer(BaseSentenceVectorizer):
-    '''
-    If embeddings were precomputed, then we could just extract them from the proper field 
+    """
+    If embeddings were precomputed, then we could just extract them from the proper field
     (set by `field_with_embedding`) from each `Example`.
-    '''
-    def __init__(self, field_with_embedding: str = 'vectorized'):
+    """
+
+    def __init__(self, field_with_embedding: str = "vectorized"):
         self.field_with_embedding = field_with_embedding
 
     def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
-        embeddings = [
-            getattr(cur_example, self.field_with_embedding).reshape(1, -1)
-            for cur_example in inp_examples
-        ]
+        embeddings = [getattr(cur_example, self.field_with_embedding).reshape(1, -1) for cur_example in inp_examples]
         embeddings = np.concatenate(embeddings, axis=0).astype(np.float32)
         return embeddings
 
 
 class CohereVectorizer(BaseSentenceVectorizer):
-    '''
+    """
     This vectorizer uses the Cohere API to convert texts to embeddings.
     More about the available models: https://docs.cohere.com/reference/embed
     `api_key` should be passed as an argument and can be retrieved
     from https://dashboard.cohere.com/api-keys
-    '''
+    """
+
     def __init__(
         self,
         api_key: str,
-        model: str = 'embed-english-v3.0',
+        model: str = "embed-english-v3.0",
         embed_batch_size: int = 96,
-        embedding_type: str = 'search_document',  # for details check Cohere embed docs
+        embedding_type: str = "search_document",  # for details check Cohere embed docs
     ):
         self.model = model
         self.embed_batch_size = embed_batch_size
         self.embedding_type = embedding_type
 
         import cohere
-        self.client = cohere.Client(api_key, client_name='dspy')
+
+        self.client = cohere.Client(api_key, client_name="dspy")
 
     def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
         text_to_vectorize = self._extract_text_from_examples(inp_examples)
@@ -139,7 +141,7 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
         for cur_batch_idx in range(n_batches):
             start_idx = cur_batch_idx * self.embed_batch_size
             end_idx = (cur_batch_idx + 1) * self.embed_batch_size
-            cur_batch = text_to_vectorize[start_idx: end_idx]
+            cur_batch = text_to_vectorize[start_idx:end_idx]
 
             response = self.client.embed(
                 texts=cur_batch,
@@ -160,14 +162,15 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
 
 
 class OpenAIVectorizer(BaseSentenceVectorizer):
-    '''
+    """
     This vectorizer uses OpenAI API to convert texts to embeddings. Changing `model` is not
     recommended. More about the model: https://openai.com/blog/new-and-improved-embedding-model/
     `api_key` should be passed as an argument or as env variable (`OPENAI_API_KEY`).
-    '''
+    """
+
     def __init__(
         self,
-        model: str = 'text-embedding-ada-002',
+        model: str = "text-embedding-ada-002",
         embed_batch_size: int = 1024,
         api_key: Optional[str] = None,
     ):
@@ -191,19 +194,20 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
         for cur_batch_idx in range(n_batches):  # tqdm.tqdm?
             start_idx = cur_batch_idx * self.embed_batch_size
             end_idx = (cur_batch_idx + 1) * self.embed_batch_size
-            cur_batch = text_to_vectorize[start_idx: end_idx]
+            cur_batch = text_to_vectorize[start_idx:end_idx]
             # OpenAI API call:
             response = self.Embedding.create(
                 model=self.model,
                 input=cur_batch,
             )
 
-            cur_batch_embeddings = [cur_obj['embedding'] for cur_obj in response['data']]
+            cur_batch_embeddings = [cur_obj["embedding"] for cur_obj in response["data"]]
             embeddings_list.extend(cur_batch_embeddings)
 
         embeddings = np.array(embeddings_list, dtype=np.float32)
         return embeddings
 
+
 class FastEmbedVectorizer(BaseSentenceVectorizer):
     """Sentence vectorizer implementaion using FastEmbed - https://qdrant.github.io/fastembed."""
 
@@ -247,4 +251,58 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
         texts_to_vectorize = self._extract_text_from_examples(inp_examples)
         embeddings = self._model.embed(texts_to_vectorize, batch_size=self._batch_size, parallel=self._parallel)
 
-        return np.array([embedding.tolist() for embedding in embeddings], dtype=np.float32)
+        return np.array([embedding.tolist() for embedding in embeddings], dtype=np.float32)
+
+
+class PremAIVectorizer(BaseSentenceVectorizer):
+    """Vectorizer that embeds text through the PremAI Embeddings API.
+
+    Supported models are listed at https://docs.premai.io/get-started/supported-models;
+    a getting-started guide is at https://docs.premai.io/introduction.
+
+    Args:
+        project_id: PremAI project ID sent with every embeddings request (mandatory).
+        api_key: PremAI API key; may instead come from an environment variable (see `get_premai_api_key`).
+        model_name: Embedding model to request; defaults to "text-embedding-3-large".
+        embed_batch_size: Maximum number of texts sent in a single API request.
+
+    Raises:
+        ImportError: If the `premai` package is not installed.
+    """
+
+    def __init__(
+        self,
+        project_id: str,
+        api_key: Optional[str] = None,
+        model_name: Optional[str] = "text-embedding-3-large",
+        embed_batch_size: int = 32,
+    ):
+        self.model_name, self.project_id = model_name, project_id
+        self.embed_batch_size = embed_batch_size
+
+        # Keep the `try` body to the one import that signals a missing optional dependency,
+        # so failures in key lookup or client construction are not misreported.
+        try:
+            from premai import Prem
+        except ImportError as error:
+            raise ImportError("Please install premai package using: pip install premai") from error
+
+        from dsp.modules.premai import get_premai_api_key
+
+        self.client = Prem(api_key=get_premai_api_key(api_key=api_key))
+
+    def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
+        """Embed `inp_examples` in batches and return the embeddings as a float32 array."""
+        text_to_vectorize = self._extract_text_from_examples(inp_examples)
+        embedding_list = []
+
+        # One request per slice of at most `embed_batch_size` texts, in input order.
+        for start_idx in range(0, len(text_to_vectorize), self.embed_batch_size):
+            response = self.client.embeddings.create(
+                project_id=self.project_id,
+                model=self.model_name,
+                input=text_to_vectorize[start_idx : start_idx + self.embed_batch_size],
+            )
+            embedding_list.extend(item.embedding for item in response.data)
+
+        return np.array(embedding_list, dtype=np.float32)