feat: add image analysis toolkit (camel-ai#1741)

raywhoelse · Wendong-Fan · web-flow · commit a3c92113f321 · 2025-03-09T02:20:40.000+08:00
Co-authored-by: Wendong-Fan <133094783+Wendong-Fan@users.noreply.github.com>
Co-authored-by: Wendong <w3ndong.fan@gmail.com>
diff --git a/camel/toolkits/__init__.py b/camel/toolkits/__init__.py
@@ -50,6 +50,7 @@
 from .zapier_toolkit import ZapierToolkit
 from .sympy_toolkit import SymPyToolkit
 from .mineru_toolkit import MinerUToolkit
+from .image_analysis_toolkit import ImageAnalysisToolkit
 
 
 __all__ = [
@@ -88,4 +89,5 @@
     'ZapierToolkit',
     'SymPyToolkit',
     'MinerUToolkit',
+    'ImageAnalysisToolkit',
 ]
diff --git a/camel/toolkits/image_analysis_toolkit.py b/camel/toolkits/image_analysis_toolkit.py
@@ -0,0 +1,205 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+from io import BytesIO
+from typing import List, Optional
+from urllib.parse import urlparse
+
+import requests
+from PIL import Image
+
+from camel.logger import get_logger
+from camel.messages import BaseMessage
+from camel.models import BaseModelBackend, ModelFactory
+from camel.toolkits import FunctionTool
+from camel.toolkits.base import BaseToolkit
+from camel.types import ModelPlatformType, ModelType
+
+logger = get_logger(__name__)
+
+
+class ImageAnalysisToolkit(BaseToolkit):
+    r"""A toolkit for comprehensive image analysis and understanding.
+    The toolkit uses vision-capable language models to perform these tasks.
+    """
+
+    def __init__(self, model: Optional[BaseModelBackend] = None):
+        r"""Set up the toolkit with the model backend used for analysis.
+
+        Args:
+            model (Optional[BaseModelBackend]): Backend that answers the
+                image prompts issued by this toolkit. It should be able to
+                process image input. When omitted (or otherwise falsy), a
+                backend is created through :obj:`ModelFactory` with the
+                default platform and default model type.
+                (default: :obj:`None`)
+        """
+        # Only fall back to the factory when no usable backend was supplied;
+        # the factory call is not evaluated otherwise.
+        self.model = model or ModelFactory.create(
+            model_platform=ModelPlatformType.DEFAULT,
+            model_type=ModelType.DEFAULT,
+        )
+
+    def image_to_text(
+        self, image_path: str, sys_prompt: Optional[str] = None
+    ) -> str:
+        r"""Generates textual description of an image with optional custom
+        prompt.
+
+        Args:
+            image_path (str): Local path or URL to an image file.
+            sys_prompt (Optional[str]): Custom system prompt for the analysis.
+                (default: :obj:`None`)
+
+        Returns:
+            str: Natural language description of the image.
+        """
+        # NOTE(review): the docstring above is kept verbatim on purpose;
+        # FunctionTool presumably derives the tool schema from it, so confirm
+        # before rewording.
+        #
+        # The built-in prompt contains a line break followed by 12 spaces of
+        # indentation; the adjacent literals below spell that out explicitly.
+        fallback_prompt = (
+            "You are an image analysis expert. Provide a \n"
+            "            detailed description including text if present."
+        )
+        # An empty custom prompt is treated the same as no custom prompt.
+        instructions = sys_prompt or fallback_prompt
+
+        return self._analyze_image(
+            image_path=image_path,
+            prompt="Please describe the contents of this image.",
+            system_message=BaseMessage.make_assistant_message(
+                role_name="Senior Computer Vision Analyst",
+                content=instructions,
+            ),
+        )
+
+    def ask_question_about_image(
+        self, image_path: str, question: str, sys_prompt: Optional[str] = None
+    ) -> str:
+        r"""Answers image questions with optional custom instructions.
+
+        Args:
+            image_path (str): Local path or URL to an image file.
+            question (str): Query about the image content.
+            sys_prompt (Optional[str]): Custom system prompt for the analysis.
+                (default: :obj:`None`)
+
+        Returns:
+            str: Detailed answer based on visual understanding
+        """
+        # NOTE(review): the docstring above is kept verbatim on purpose;
+        # FunctionTool presumably derives the tool schema from it, so confirm
+        # before rewording.
+        #
+        # Each numbered item of the built-in prompt starts on its own line
+        # and is preceded by 12 spaces of indentation.
+        qa_guidelines = (
+            "Answer questions about images by:\n"
+            "            1. Careful visual inspection\n"
+            "            2. Contextual reasoning\n"
+            "            3. Text transcription where relevant\n"
+            "            4. Logical deduction from visual evidence"
+        )
+        # An empty custom prompt is treated the same as no custom prompt.
+        instructions = sys_prompt or qa_guidelines
+
+        return self._analyze_image(
+            image_path=image_path,
+            prompt=question,
+            system_message=BaseMessage.make_assistant_message(
+                role_name="Visual QA Specialist",
+                content=instructions,
+            ),
+        )
+
+    def _load_image(self, image_path: str) -> Image.Image:
+        r"""Loads an image from either local path or URL.
+
+        The image is fully decoded and copied before it is returned, so the
+        result does not depend on an open file handle or download buffer.
+
+        Args:
+            image_path (str): Local path or ``http``/``https`` URL of the
+                image. Anything without one of those two schemes is treated
+                as a local path.
+
+        Returns:
+            Image.Image: Loaded PIL Image object.
+
+        Raises:
+            ValueError: If the local file or the downloaded content cannot be
+                opened and decoded as an image. The underlying error is
+                attached as ``__cause__``.
+            requests.exceptions.RequestException: For URL fetch failures
+                (including the 15 second timeout and HTTP error statuses).
+        """
+        parsed = urlparse(image_path)
+
+        if parsed.scheme in ("http", "https"):
+            logger.debug(f"Fetching image from URL: {image_path}")
+            try:
+                response = requests.get(image_path, timeout=15)
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                logger.error(f"URL fetch failed: {e}")
+                raise
+
+            try:
+                # Image.open is lazy, so decode right away: a corrupt or
+                # non-image payload must fail here as the documented
+                # ValueError instead of surfacing later in the caller.
+                with Image.open(BytesIO(response.content)) as img:
+                    img.load()
+                    return img.copy()
+            except Exception as e:
+                logger.error(f"Image loading failed: {e}")
+                raise ValueError(f"Invalid image content: {e}") from e
+
+        logger.debug(f"Loading local image: {image_path}")
+        try:
+            with Image.open(image_path) as img:
+                # Load immediately to detect errors
+                img.load()
+                # The copy stays usable after the file is closed on exit.
+                return img.copy()
+        except Exception as e:
+            logger.error(f"Image loading failed: {e}")
+            raise ValueError(f"Invalid image file: {e}") from e
+
+    def _analyze_image(
+        self,
+        image_path: str,
+        prompt: str,
+        system_message: BaseMessage,
+    ) -> str:
+        r"""Runs one image-plus-prompt exchange against the model.
+
+        The image is loaded, attached to a user message carrying
+        :obj:`prompt`, and sent through a newly created :obj:`ChatAgent`
+        configured with :obj:`system_message` and the toolkit's model.
+
+        Args:
+            image_path (str): Local path or URL of the image to analyze.
+            prompt (str): Text sent to the model together with the image.
+            system_message (BaseMessage): System message for the agent.
+
+        Returns:
+            str: Content of the first response message. No exception
+                propagates to the caller: a :obj:`ValueError` or a
+                ``requests`` exception yields a string starting with
+                ``"Image error: "``, any other exception a string starting
+                with ``"Analysis failed: "``.
+        """
+        try:
+            loaded_image = self._load_image(image_path)
+            logger.info(f"Analyzing image: {image_path}")
+
+            # Imported here rather than at module level.
+            # NOTE(review): presumably avoids a circular import between the
+            # agents and toolkits packages; confirm.
+            from camel.agents.chat_agent import ChatAgent
+
+            vision_agent = ChatAgent(
+                system_message=system_message,
+                model=self.model,
+            )
+            request_msg = BaseMessage.make_user_message(
+                role_name="User",
+                content=prompt,
+                image_list=[loaded_image],
+            )
+
+            reply = vision_agent.step(request_msg)
+            vision_agent.reset()
+            return reply.msgs[0].content
+
+        except (ValueError, requests.exceptions.RequestException) as err:
+            logger.error(f"Image handling error: {err}")
+            return f"Image error: {err!s}"
+        except Exception as err:
+            logger.error(f"Unexpected error: {err}")
+            return f"Analysis failed: {err!s}"
+
+    def get_tools(self) -> List[FunctionTool]:
+        r"""Returns the toolkit's public methods wrapped as tools.
+
+        Returns:
+            List[FunctionTool]: One :obj:`FunctionTool` each for
+                :meth:`image_to_text` and :meth:`ask_question_about_image`,
+                in that order.
+        """
+        exposed = (self.image_to_text, self.ask_question_about_image)
+        return [FunctionTool(func) for func in exposed]
diff --git a/examples/toolkits/image_analysis_toolkit.py b/examples/toolkits/image_analysis_toolkit.py
@@ -0,0 +1,60 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+from camel.agents import ChatAgent
+from camel.messages.base import BaseMessage
+from camel.models import ModelFactory
+from camel.toolkits import ImageAnalysisToolkit
+from camel.types import ModelPlatformType, ModelType
+
+# One backend is shared by the toolkit (which performs the image analysis)
+# and by the agent that is given the toolkit's tools.
+model = ModelFactory.create(
+    model_platform=ModelPlatformType.DEFAULT,
+    model_type=ModelType.DEFAULT,
+)
+
+image_analysis_toolkit = ImageAnalysisToolkit(model=model)
+
+agent = ChatAgent(
+    system_message="You are a helpful assistant.",
+    model=model,
+    tools=[*image_analysis_toolkit.get_tools()],
+)
+
+# Build the URL from adjacent literals so it reaches the agent as a single
+# unbroken string; line breaks and indentation inside the link would have to
+# be removed by the model before it could hand the link to a tool.
+image_url = (
+    "https://upload.wikimedia.org/wikipedia/commons/"
+    "thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/"
+    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+)
+
+user_msg = BaseMessage.make_user_message(
+    role_name="User",
+    content=(
+        f"The image link is: {image_url}\n"
+        "What's in this image? You must use image analysis to help me."
+    ),
+)
+response = agent.step(user_msg)
+print(response.msgs[0].content)
+"""
+===========================================================================
+The image depicts a serene landscape featuring a wooden boardwalk that leads 
+through a lush, green marsh or meadow. The boardwalk is centrally positioned, 
+extending into the distance and inviting viewers to imagine walking along it. 
+On either side of the boardwalk, tall grass and various vegetation create a 
+vibrant green expanse.
+
+In the background, there are clusters of trees and shrubs, adding depth to the 
+scene. The sky above is mostly clear with a few scattered clouds, showcasing a 
+gradient of blue hues. The overall atmosphere is tranquil and natural, 
+suggesting a peaceful outdoor setting, with soft lighting that likely 
+indicates early morning or late afternoon."
+============================================================================
+"""
diff --git a/test/toolkits/test_image_analysis_toolkiy.py b/test/toolkits/test_image_analysis_toolkiy.py