feat: Jina reranker toolkit (camel-ai#2170)

JINO-ROHIT · Wendong-Fan · web-flow · commit 48687a609902 · 2025-04-23T03:47:38.000+08:00
Co-authored-by: Wendong-Fan <133094783+Wendong-Fan@users.noreply.github.com>
diff --git a/camel/toolkits/__init__.py b/camel/toolkits/__init__.py
@@ -67,6 +67,7 @@
 from .pyautogui_toolkit import PyAutoGUIToolkit
 from .openai_agent_toolkit import OpenAIAgentToolkit
 from .searxng_toolkit import SearxNGToolkit
+from .jina_reranker_toolkit import JinaRerankerToolkit
 
 
 __all__ = [
@@ -122,4 +123,5 @@
     'PyAutoGUIToolkit',
     'OpenAIAgentToolkit',
     'SearxNGToolkit',
+    'JinaRerankerToolkit',
 ]
diff --git a/camel/toolkits/jina_reranker_toolkit.py b/camel/toolkits/jina_reranker_toolkit.py
@@ -0,0 +1,223 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+from typing import List, Optional, Tuple
+
+import torch
+from transformers import AutoModel
+
+from camel.toolkits import FunctionTool
+from camel.toolkits.base import BaseToolkit
+from camel.utils import MCPServer
+
+
+@MCPServer()
+class JinaRerankerToolkit(BaseToolkit):
+    r"""A class representing a toolkit for reranking documents
+    using Jina Reranker.
+
+    This class provides methods for reranking documents (text or images)
+    based on their relevance to a given query using the Jina Reranker model.
+    """
+
+    def __init__(
+        self,
+        timeout: Optional[float] = None,
+        device: Optional[str] = None,
+    ) -> None:
+        r"""Initializes a new instance of the JinaRerankerToolkit class.
+
+        Args:
+            timeout (Optional[float]): The timeout value for API requests
+                in seconds. If None, no timeout is applied.
+                (default: :obj:`None`)
+            device (Optional[str]): Device to load the model on. If None,
+                will use CUDA if available, otherwise CPU.
+                (default: :obj:`None`)
+        """
+        super().__init__(timeout=timeout)
+
+        self.model = AutoModel.from_pretrained(
+            'jinaai/jina-reranker-m0',
+            torch_dtype="auto",
+            trust_remote_code=True,
+        )
+        DEVICE = (
+            device
+            if device is not None
+            else ("cuda" if torch.cuda.is_available() else "cpu")
+        )
+        self.model.to(DEVICE)
+        self.model.eval()
+
+    def _sort_documents(
+        self, documents: List[str], scores: List[float]
+    ) -> List[Tuple[str, float]]:
+        r"""Sort documents by their scores in descending order.
+
+        Args:
+            documents (List[str]): List of documents to sort.
+            scores (List[float]): Corresponding scores for each document.
+
+        Returns:
+            List[Tuple[str, float]]: Sorted list of (document, score) pairs.
+
+        Raises:
+            ValueError: If documents and scores have different lengths.
+        """
+        if len(documents) != len(scores):
+            raise ValueError("Number of documents must match number of scores")
+        doc_score_pairs = list(zip(documents, scores))
+        doc_score_pairs.sort(key=lambda x: x[1], reverse=True)
+
+        return doc_score_pairs
+
+    def rerank_text_documents(
+        self,
+        query: str,
+        documents: List[str],
+        max_length: int = 1024,
+    ) -> List[Tuple[str, float]]:
+        r"""Reranks text documents based on their relevance to a text query.
+
+        Args:
+            query (str): The text query for reranking.
+            documents (List[str]): List of text documents to be reranked.
+            max_length (int): Maximum token length for processing.
+                (default: :obj:`1024`)
+
+        Returns:
+            List[Tuple[str, float]]: A list of tuples containing
+                the reranked documents and their relevance scores.
+        """
+        if self.model is None:
+            raise ValueError(
+                "Model has not been initialized or failed to initialize."
+            )
+
+        with torch.inference_mode():
+            text_pairs = [[query, doc] for doc in documents]
+            scores = self.model.compute_score(
+                text_pairs, max_length=max_length, doc_type="text"
+            )
+
+        return self._sort_documents(documents, scores)
+
+    def rerank_image_documents(
+        self,
+        query: str,
+        documents: List[str],
+        max_length: int = 2048,
+    ) -> List[Tuple[str, float]]:
+        r"""Reranks image documents based on their relevance to a text query.
+
+        Args:
+            query (str): The text query for reranking.
+            documents (List[str]): List of image URLs or paths to be reranked.
+            max_length (int): Maximum token length for processing.
+                (default: :obj:`2048`)
+
+        Returns:
+            List[Tuple[str, float]]: A list of tuples containing
+                the reranked image URLs/paths and their relevance scores.
+        """
+        if self.model is None:
+            raise ValueError(
+                "Model has not been initialized or failed to initialize."
+            )
+
+        with torch.inference_mode():
+            image_pairs = [[query, doc] for doc in documents]
+            scores = self.model.compute_score(
+                image_pairs, max_length=max_length, doc_type="image"
+            )
+
+        return self._sort_documents(documents, scores)
+
+    def image_query_text_documents(
+        self,
+        image_query: str,
+        documents: List[str],
+        max_length: int = 2048,
+    ) -> List[Tuple[str, float]]:
+        r"""Reranks text documents based on their relevance to an image query.
+
+        Args:
+            image_query (str): The image URL or path used as query.
+            documents (List[str]): List of text documents to be reranked.
+            max_length (int): Maximum token length for processing.
+                (default: :obj:`2048`)
+
+        Returns:
+            List[Tuple[str, float]]: A list of tuples containing
+                the reranked documents and their relevance scores.
+        """
+        if self.model is None:
+            raise ValueError("Model has not been initialized.")
+        with torch.inference_mode():
+            image_pairs = [[image_query, doc] for doc in documents]
+            scores = self.model.compute_score(
+                image_pairs,
+                max_length=max_length,
+                query_type="image",
+                doc_type="text",
+            )
+
+        return self._sort_documents(documents, scores)
+
+    def image_query_image_documents(
+        self,
+        image_query: str,
+        documents: List[str],
+        max_length: int = 2048,
+    ) -> List[Tuple[str, float]]:
+        r"""Reranks image documents based on their relevance to an image query.
+
+        Args:
+            image_query (str): The image URL or path used as query.
+            documents (List[str]): List of image URLs or paths to be reranked.
+            max_length (int): Maximum token length for processing.
+                (default: :obj:`2048`)
+
+        Returns:
+            List[Tuple[str, float]]: A list of tuples containing
+                the reranked image URLs/paths and their relevance scores.
+        """
+        if self.model is None:
+            raise ValueError("Model has not been initialized.")
+
+        with torch.inference_mode():
+            image_pairs = [[image_query, doc] for doc in documents]
+            scores = self.model.compute_score(
+                image_pairs,
+                max_length=max_length,
+                query_type="image",
+                doc_type="image",
+            )
+
+        return self._sort_documents(documents, scores)
+
+    def get_tools(self) -> List[FunctionTool]:
+        r"""Returns a list of FunctionTool objects representing the
+        functions in the toolkit.
+
+        Returns:
+            List[FunctionTool]: A list of FunctionTool objects
+                representing the functions in the toolkit.
+        """
+        return [
+            FunctionTool(self.rerank_text_documents),
+            FunctionTool(self.rerank_image_documents),
+            FunctionTool(self.image_query_text_documents),
+            FunctionTool(self.image_query_image_documents),
+        ]
diff --git a/docs/key_modules/tools.md b/docs/key_modules/tools.md
@@ -166,6 +166,7 @@ CAMEL provides a variety of built-in toolkits that you can use right away. Here'
 | GoogleScholarToolkit | A toolkit for retrieving information about authors and their publications from Google Scholar. |
 | HumanToolkit | A toolkit for facilitating human-in-the-loop interactions and feedback in AI systems. |
 | ImageAnalysisToolkit | A toolkit for comprehensive image analysis and understanding using vision-capable language models. |
+| JinaRerankerToolkit | A toolkit for reranking documents (text or images) based on their relevance to a given query using the Jina Reranker model. |
 | LinkedInToolkit | A toolkit for LinkedIn operations including creating posts, deleting posts, and retrieving user profile information. |
 | MathToolkit | A toolkit for performing basic mathematical operations such as addition, subtraction, and multiplication. |
 | MCPToolkit | A toolkit for interacting with external tools using the Model Context Protocol (MCP).  |
diff --git a/examples/toolkits/jina_reranker_toolkit.py b/examples/toolkits/jina_reranker_toolkit.py
@@ -0,0 +1,63 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+from camel.agents import ChatAgent
+from camel.models import ModelFactory
+from camel.toolkits import JinaRerankerToolkit
+from camel.types import ModelPlatformType, ModelType
+
+model = ModelFactory.create(
+    model_platform=ModelPlatformType.DEFAULT,
+    model_type=ModelType.DEFAULT,
+)
+
+reranker_toolkit = JinaRerankerToolkit(device="cpu")
+reranker_tool = reranker_toolkit.get_tools()
+
+agent = ChatAgent(model=model, tools=reranker_tool)
+
+documents = [
+    "Markdown is a lightweight markup language with plain-text "
+    "formatting syntax.",
+    "Python is a high-level, interpreted programming language known for "
+    "its readability.",
+    "SLM (Small Language Models) are compact AI models designed for "
+    "specific tasks.",
+    "JavaScript is a scripting language primarily used for "
+    "creating interactive web pages.",
+]
+
+query = "How to use markdown with small language models"
+
+response = agent.step(
+    f"Can you rerank these documents {documents} against the query {query}"
+)
+print(str(response.info['tool_calls'])[:1000])
+"""
+===========================================================================
+[ToolCallingRecord(tool_name='rerank_text_documents', args={'query': 'How to 
+use markdown with small language models', 'documents': ['Markdown is a 
+lightweight markup language with plain-text formatting syntax.', 'Python is a 
+high-level, interpreted programming language known for its readability.', 'SLM 
+(Small Language Models) are compact AI models designed for specific tasks.', 
+'JavaScript is a scripting language primarily used for creating interactive 
+web pages.'], 'max_length': 1024}, result=[('Markdown is a lightweight markup 
+language with plain-text formatting syntax.', 0.7915633916854858), ('SLM 
+(Small Language Models) are compact AI models designed for specific tasks.', 0.
+7915633916854858), ('Python is a high-level, interpreted programming language 
+known for its readability.', 0.43936243653297424), ('JavaScript is a scripting 
+language primarily used for creating interactive web pages.', 0.
+3716837763786316)], tool_call_id='call_JKnuvTO1fUQP7PWhyCSQCK7N')]
+===========================================================================
+"""
diff --git a/test/toolkits/test_jina_reranker_toolkit.py b/test/toolkits/test_jina_reranker_toolkit.py