Skip to content

Commit a3c9211

Browse files
feat: add image analysis toolkit (camel-ai#1741)
Co-authored-by: Wendong-Fan <[email protected]> Co-authored-by: Wendong <[email protected]>
1 parent 0243737 commit a3c9211

File tree

4 files changed

+442
-0
lines changed

4 files changed

+442
-0
lines changed

camel/toolkits/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
from .zapier_toolkit import ZapierToolkit
5151
from .sympy_toolkit import SymPyToolkit
5252
from .mineru_toolkit import MinerUToolkit
53+
from .image_analysis_toolkit import ImageAnalysisToolkit
5354

5455

5556
__all__ = [
@@ -88,4 +89,5 @@
8889
'ZapierToolkit',
8990
'SymPyToolkit',
9091
'MinerUToolkit',
92+
'ImageAnalysisToolkit',
9193
]
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14+
15+
from io import BytesIO
16+
from typing import List, Optional
17+
from urllib.parse import urlparse
18+
19+
import requests
20+
from PIL import Image
21+
22+
from camel.logger import get_logger
23+
from camel.messages import BaseMessage
24+
from camel.models import BaseModelBackend, ModelFactory
25+
from camel.toolkits import FunctionTool
26+
from camel.toolkits.base import BaseToolkit
27+
from camel.types import ModelPlatformType, ModelType
28+
29+
logger = get_logger(__name__)
30+
31+
32+
class ImageAnalysisToolkit(BaseToolkit):
    r"""A toolkit for comprehensive image analysis and understanding.
    The toolkit uses vision-capable language models to perform these tasks.

    Two tools are exposed through :meth:`get_tools`: free-form image
    description (:meth:`image_to_text`) and visual question answering
    (:meth:`ask_question_about_image`). Both accept a local path or an
    ``http``/``https`` URL and return a plain string; failures are reported
    in the returned string instead of being raised.
    """

    def __init__(self, model: Optional[BaseModelBackend] = None):
        r"""Initialize the ImageAnalysisToolkit.

        Args:
            model (Optional[BaseModelBackend]): The model backend to use for
                image analysis tasks. This model should support processing
                images for tasks like image description and visual question
                answering. If None, a default model will be created using
                ModelFactory. (default: :obj:`None`)
        """
        # Compare against None explicitly so that a caller-supplied backend
        # is never discarded because of how its truthiness is defined.
        if model is not None:
            self.model = model
        else:
            self.model = ModelFactory.create(
                model_platform=ModelPlatformType.DEFAULT,
                model_type=ModelType.DEFAULT,
            )

    def image_to_text(
        self, image_path: str, sys_prompt: Optional[str] = None
    ) -> str:
        r"""Generates textual description of an image with optional custom
        prompt.

        Args:
            image_path (str): Local path or URL to an image file.
            sys_prompt (Optional[str]): Custom system prompt for the analysis.
                An empty string is treated like :obj:`None` and the built-in
                prompt is used. (default: :obj:`None`)

        Returns:
            str: Natural language description of the image, or a message
                starting with ``"Image error:"`` / ``"Analysis failed:"``
                when the image could not be loaded or analysed.
        """
        # Explicit line breaks keep source-code indentation out of the
        # prompt text that is sent to the model.
        default_content = (
            "You are an image analysis expert. Provide a\n"
            "detailed description including text if present."
        )

        system_msg = BaseMessage.make_assistant_message(
            role_name="Senior Computer Vision Analyst",
            content=sys_prompt if sys_prompt else default_content,
        )

        return self._analyze_image(
            image_path=image_path,
            prompt="Please describe the contents of this image.",
            system_message=system_msg,
        )

    def ask_question_about_image(
        self, image_path: str, question: str, sys_prompt: Optional[str] = None
    ) -> str:
        r"""Answers image questions with optional custom instructions.

        Args:
            image_path (str): Local path or URL to an image file.
            question (str): Query about the image content.
            sys_prompt (Optional[str]): Custom system prompt for the analysis.
                An empty string is treated like :obj:`None` and the built-in
                prompt is used. (default: :obj:`None`)

        Returns:
            str: Detailed answer based on visual understanding, or a message
                starting with ``"Image error:"`` / ``"Analysis failed:"``
                when the image could not be loaded or analysed.
        """
        # Explicit line breaks keep source-code indentation out of the
        # prompt text that is sent to the model.
        default_content = (
            "Answer questions about images by:\n"
            "1. Careful visual inspection\n"
            "2. Contextual reasoning\n"
            "3. Text transcription where relevant\n"
            "4. Logical deduction from visual evidence"
        )

        system_msg = BaseMessage.make_assistant_message(
            role_name="Visual QA Specialist",
            content=sys_prompt if sys_prompt else default_content,
        )

        return self._analyze_image(
            image_path=image_path,
            prompt=question,
            system_message=system_msg,
        )

    @staticmethod
    def _detach_image(img: Image.Image) -> Image.Image:
        r"""Decodes an opened image and returns an independent copy of it.

        Args:
            img (Image.Image): An image freshly returned by ``Image.open``.

        Returns:
            Image.Image: A fully decoded copy that no longer depends on the
                underlying file or buffer.
        """
        # Image.open is lazy; load() forces decoding so that corrupt data
        # is detected here rather than later during the model call.
        img.load()
        detached = img.copy()
        # Pillow sets `format` to None on images produced by its own methods
        # (such as copy()); carry the source format over so the returned
        # image has the same metadata as the opened one.
        detached.format = img.format
        return detached

    def _load_image(self, image_path: str) -> Image.Image:
        r"""Loads an image from either local path or URL.

        The image is decoded eagerly in both cases, so an unreadable image
        is reported by this method and not by a later consumer.

        Args:
            image_path (str): Local path or URL to image. Only the ``http``
                and ``https`` schemes are fetched over the network; anything
                else is treated as a local path.

        Returns:
            Image.Image: Loaded PIL Image object.

        Raises:
            ValueError: For invalid paths/URLs or unreadable images.
            requests.exceptions.RequestException: For URL fetch failures.
        """
        parsed = urlparse(image_path)

        if parsed.scheme in ("http", "https"):
            logger.debug(f"Fetching image from URL: {image_path}")
            try:
                response = requests.get(image_path, timeout=15)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                logger.error(f"URL fetch failed: {e}")
                raise

            # Decoding is handled separately from fetching so that a
            # successful download of non-image data surfaces as the
            # documented ValueError, exactly like a bad local file.
            try:
                with Image.open(BytesIO(response.content)) as img:
                    return self._detach_image(img)
            except Exception as e:
                logger.error(f"Image loading failed: {e}")
                raise ValueError(f"Invalid image file: {e}") from e
        else:
            logger.debug(f"Loading local image: {image_path}")
            try:
                # The context manager closes the file handle; the detached
                # copy stays usable afterwards.
                with Image.open(image_path) as img:
                    return self._detach_image(img)
            except Exception as e:
                logger.error(f"Image loading failed: {e}")
                raise ValueError(f"Invalid image file: {e}") from e

    def _analyze_image(
        self,
        image_path: str,
        prompt: str,
        system_message: BaseMessage,
    ) -> str:
        r"""Core analysis method handling image loading and processing.

        Loads the image, sends it together with ``prompt`` to a fresh
        ``ChatAgent`` built on ``self.model`` and returns the content of the
        first response message. Any ``Exception`` raised along the way is
        logged and converted into a returned message.

        Args:
            image_path (str): Image location.
            prompt (str): Analysis query/instructions.
            system_message (BaseMessage): Custom system prompt for the
                analysis.

        Returns:
            str: Analysis result or error message. Image loading problems
                are prefixed with ``"Image error:"``; every other failure is
                prefixed with ``"Analysis failed:"``.
        """
        try:
            image = self._load_image(image_path)
            logger.info(f"Analyzing image: {image_path}")

            # Imported at call time rather than at module level.
            # NOTE(review): presumably to avoid a circular import between
            # the agents and toolkits packages - confirm.
            from camel.agents.chat_agent import ChatAgent

            agent = ChatAgent(
                system_message=system_message,
                model=self.model,
            )

            user_msg = BaseMessage.make_user_message(
                role_name="User",
                content=prompt,
                image_list=[image],
            )

            response = agent.step(user_msg)
            agent.reset()
            return response.msgs[0].content

        except (ValueError, requests.exceptions.RequestException) as e:
            logger.error(f"Image handling error: {e}")
            return f"Image error: {e!s}"
        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            return f"Analysis failed: {e!s}"

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the functions
        in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects representing the
                functions in the toolkit.
        """
        return [
            FunctionTool(self.image_to_text),
            FunctionTool(self.ask_question_about_image),
        ]
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14+
15+
from camel.agents import ChatAgent
16+
from camel.messages.base import BaseMessage
17+
from camel.models import ModelFactory
18+
from camel.toolkits import ImageAnalysisToolkit
19+
from camel.types import ModelPlatformType, ModelType
20+
21+
# One backend is shared by the toolkit (which looks at the image) and by the
# agent (which decides when to call the toolkit).
model = ModelFactory.create(
    model_type=ModelType.DEFAULT,
    model_platform=ModelPlatformType.DEFAULT,
)

image_analysis_toolkit = ImageAnalysisToolkit(model=model)
image_tools = list(image_analysis_toolkit.get_tools())

agent = ChatAgent(
    system_message="You are a helpful assistant.",
    model=model,
    tools=image_tools,
)

# The URL is wrapped over several lines to respect the line-length limit;
# the text is passed to the agent exactly as written here.
request_text = '''
The image link is: https://upload.wikimedia.org/wikipedia/commons/
thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/
2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
What's in this image? You must use image analysis to help me.
'''

user_msg = BaseMessage.make_user_message(
    role_name="User",
    content=request_text,
)

# Run a single turn and print the first reply.
response = agent.step(user_msg)
print(response.msgs[0].content)
46+
"""
47+
===========================================================================
48+
The image depicts a serene landscape featuring a wooden boardwalk that leads
49+
through a lush, green marsh or meadow. The boardwalk is centrally positioned,
50+
extending into the distance and inviting viewers to imagine walking along it.
51+
On either side of the boardwalk, tall grass and various vegetation create a
52+
vibrant green expanse.
53+
54+
In the background, there are clusters of trees and shrubs, adding depth to the
55+
scene. The sky above is mostly clear with a few scattered clouds, showcasing a
56+
gradient of blue hues. The overall atmosphere is tranquil and natural,
57+
suggesting a peaceful outdoor setting, with soft lighting that likely
58+
indicates early morning or late afternoon.
59+
============================================================================
60+
"""

0 commit comments

Comments
 (0)