Commit cb44039

feat: add scrapegraph-sdk integration (camel-ai#2206)
Co-authored-by: Wendong-Fan <[email protected]>
Parent: ddea8c9

File tree: 4 files changed, +303 -1 lines changed

Lines changed: 96 additions & 0 deletions
New file: camel/loaders/scrapegraph_reader.py
@@ -0,0 +1,96 @@
# ========= Copyright 2023-2025 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2025 @ CAMEL-AI.org. All Rights Reserved. =========

import os
from typing import Any, Dict, Optional

from pydantic import BaseModel


class ScrapeGraphAI:
    r"""ScrapeGraphAI allows you to perform AI-powered web scraping and
    searching.

    Args:
        api_key (Optional[str]): API key for authenticating with the
            ScrapeGraphAI API.

    References:
        https://scrapegraph.ai/
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
    ) -> None:
        from scrapegraph_py import Client
        from scrapegraph_py.logger import sgai_logger

        self._api_key = api_key or os.environ.get("SCRAPEGRAPH_API_KEY")
        sgai_logger.set_logging(level="INFO")
        self.client = Client(api_key=self._api_key)

    def search(
        self,
        user_prompt: str,
    ) -> Dict[str, Any]:
        r"""Perform an AI-powered web search using ScrapeGraphAI.

        Args:
            user_prompt (str): The search query or instructions.

        Returns:
            Dict[str, Any]: The search results, including the answer and
                reference URLs.

        Raises:
            RuntimeError: If the search process fails.
        """
        try:
            response = self.client.searchscraper(user_prompt=user_prompt)
            return response
        except Exception as e:
            raise RuntimeError(f"Failed to perform search: {e}")

    def scrape(
        self,
        website_url: str,
        user_prompt: str,
        website_html: Optional[str] = None,
    ) -> Dict[str, Any]:
        r"""Perform AI-powered web scraping using ScrapeGraphAI.

        Args:
            website_url (str): The URL to scrape.
            user_prompt (str): Instructions for what data to extract.
            website_html (Optional[str]): Optional HTML content to use
                instead of fetching from the URL.

        Returns:
            Dict[str, Any]: The scraped data, including the request ID and
                the result.

        Raises:
            RuntimeError: If the scrape process fails.
        """
        try:
            response = self.client.smartscraper(
                website_url=website_url,
                user_prompt=user_prompt,
                website_html=website_html,
            )
            return response
        except Exception as e:
            raise RuntimeError(f"Failed to perform scrape: {e}")

    def close(self) -> None:
        r"""Close the ScrapeGraphAI client connection."""
        self.client.close()
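
Not part of the commit, but as a quick orientation: a minimal usage sketch of
the loader above, assuming SCRAPEGRAPH_API_KEY is set in the environment.
Wrapping the instance in contextlib.closing guarantees that close() runs even
if a request raises.

from contextlib import closing

from camel.loaders.scrapegraph_reader import ScrapeGraphAI

# closing() calls scraper.close() on exit, mirroring the try/finally pattern
# used in the example script below.
with closing(ScrapeGraphAI()) as scraper:
    result = scraper.search(user_prompt="What is CAMEL-AI?")
    print(result.get("answer", "No answer found"))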

Lines changed: 100 additions & 0 deletions
New file: example script demonstrating the ScrapeGraphAI reader
@@ -0,0 +1,100 @@
# ========= Copyright 2023-2025 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2025 @ CAMEL-AI.org. All Rights Reserved. =========

"""
Example demonstrating how to use the ScrapeGraphAI reader for web scraping
and searching.

This example shows:
1. How to initialize the ScrapeGraphAI reader
2. How to perform AI-powered web searches
3. How to scrape websites with specific instructions
4. How to handle errors and close the connection
"""

import os
from typing import Any, Dict

from camel.loaders.scrapegraph_reader import ScrapeGraphAI


def search_example(api_key: str) -> Dict[str, Any]:
    """Example of performing an AI-powered web search."""
    # Initialize the ScrapeGraphAI reader
    scraper = ScrapeGraphAI(api_key=api_key)

    try:
        # Perform a search
        search_query = "What are the latest developments in AI?"
        result = scraper.search(user_prompt=search_query)

        print("\nSearch Results:")
        print(f"Answer: {result.get('answer', 'No answer found')}")
        print("References:")
        for url in result.get('references', []):
            print(f"- {url}")

        return result
    finally:
        # Always close the connection
        scraper.close()


def scrape_example(api_key: str) -> Dict[str, Any]:
    """Example of scraping a website with specific instructions."""
    # Initialize the ScrapeGraphAI reader
    scraper = ScrapeGraphAI(api_key=api_key)

    try:
        # Scrape a website with specific instructions
        website_url = "https://example.com"
        instructions = """
        Extract the following information:
        1. Main title of the page
        2. All paragraph texts
        3. Any links to other pages
        """

        result = scraper.scrape(
            website_url=website_url,
            user_prompt=instructions,
        )

        print("\nScraping Results:")
        print(f"Request ID: {result.get('request_id', 'No ID')}")
        print("Extracted Data:")
        print(result.get('result', {}))

        return result
    finally:
        # Always close the connection
        scraper.close()


def main():
    # Get the API key from the environment or fall back to a placeholder
    api_key = os.environ.get("SCRAPEGRAPH_API_KEY", "your_api_key_here")

    if api_key == "your_api_key_here":
        print("Please set your SCRAPEGRAPH_API_KEY environment variable")
        return

    print("Running search example...")
    search_example(api_key)

    print("\nRunning scrape example...")
    scrape_example(api_key)


if __name__ == "__main__":
    main()
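
A small supplementary sketch, not part of the commit: scrape() also accepts
pre-fetched HTML through the website_html parameter, which the example above
does not exercise. Per the loader's docstring, the supplied markup is used
instead of fetching website_url again.

from camel.loaders.scrapegraph_reader import ScrapeGraphAI

scraper = ScrapeGraphAI()  # assumes SCRAPEGRAPH_API_KEY is set
try:
    result = scraper.scrape(
        website_url="https://example.com",
        user_prompt="Extract the main title of the page",
        website_html="<html><head><title>Example Domain</title></head></html>",
    )
    print(result.get("result", {}))
finally:
    scraper.close()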

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -100,7 +100,8 @@ web_tools = [
     "playwright>=1.50.0",
     "html2text>=2024.2.26",
     "beautifulsoup4>=4,<5",
-    "exa-py>=1.10.0,<2"
+    "exa-py>=1.10.0,<2",
+    "scrapegraph-py>=1.12.0,<2",
 ]
 document_tools = [
     "numpy~=1.26",

Lines changed: 105 additions & 0 deletions
New file: unit tests for the ScrapeGraphAI loader
@@ -0,0 +1,105 @@
# ========= Copyright 2023-2025 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2025 @ CAMEL-AI.org. All Rights Reserved. =========

import os
from unittest.mock import MagicMock, patch

import pytest

from camel.loaders.scrapegraph_reader import ScrapeGraphAI


@pytest.fixture
def scrapegraph_ai():
    with patch("camel.loaders.scrapegraph_reader.Client") as mock_client:
        mock_client_instance = MagicMock()
        mock_client.return_value = mock_client_instance
        yield ScrapeGraphAI(api_key="test_api_key")


def test_init_with_api_key():
    with patch("camel.loaders.scrapegraph_reader.Client") as mock_client:
        ScrapeGraphAI(api_key="test_api_key")
        mock_client.assert_called_once_with(api_key="test_api_key")


def test_init_with_env_var():
    with patch("camel.loaders.scrapegraph_reader.Client") as mock_client, \
            patch.dict(os.environ, {"SCRAPEGRAPH_API_KEY": "env_api_key"}):
        ScrapeGraphAI()
        mock_client.assert_called_once_with(api_key="env_api_key")


def test_search_success(scrapegraph_ai):
    mock_response = {"answer": "test answer", "references": ["url1", "url2"]}
    scrapegraph_ai.client.searchscraper.return_value = mock_response

    result = scrapegraph_ai.search("test query")
    assert result == mock_response
    scrapegraph_ai.client.searchscraper.assert_called_once_with(
        user_prompt="test query"
    )


def test_search_failure(scrapegraph_ai):
    scrapegraph_ai.client.searchscraper.side_effect = Exception("Search failed")

    with pytest.raises(
        RuntimeError, match="Failed to perform search: Search failed"
    ):
        scrapegraph_ai.search("test query")


def test_scrape_success(scrapegraph_ai):
    mock_response = {"request_id": "123", "result": {"data": "test data"}}
    scrapegraph_ai.client.smartscraper.return_value = mock_response

    result = scrapegraph_ai.scrape(
        website_url="https://example.com",
        user_prompt="Extract title and description",
    )
    assert result == mock_response
    scrapegraph_ai.client.smartscraper.assert_called_once_with(
        website_url="https://example.com",
        user_prompt="Extract title and description",
        website_html=None,
    )


def test_scrape_with_html(scrapegraph_ai):
    mock_response = {"request_id": "123", "result": {"data": "test data"}}
    scrapegraph_ai.client.smartscraper.return_value = mock_response

    result = scrapegraph_ai.scrape(
        website_url="https://example.com",
        user_prompt="Extract title and description",
        website_html="<html>test</html>",
    )
    assert result == mock_response
    scrapegraph_ai.client.smartscraper.assert_called_once_with(
        website_url="https://example.com",
        user_prompt="Extract title and description",
        website_html="<html>test</html>",
    )


def test_scrape_failure(scrapegraph_ai):
    scrapegraph_ai.client.smartscraper.side_effect = Exception("Scrape failed")

    with pytest.raises(
        RuntimeError, match="Failed to perform scrape: Scrape failed"
    ):
        scrapegraph_ai.scrape(
            website_url="https://example.com",
            user_prompt="Extract title and description",
        )


def test_close(scrapegraph_ai):
    scrapegraph_ai.close()
    scrapegraph_ai.client.close.assert_called_once()
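
A hedged extra test, not part of the commit: because __init__ imports Client
lazily from scrapegraph_py, patching it at its source module also intercepts
construction, and the explicit api_key argument should win over the
SCRAPEGRAPH_API_KEY environment variable.

def test_init_api_key_overrides_env_var():
    # Hypothetical addition: the `api_key or os.environ.get(...)` fallback in
    # __init__ means an explicit key takes precedence over the env var.
    with patch("scrapegraph_py.Client") as mock_client, \
            patch.dict(os.environ, {"SCRAPEGRAPH_API_KEY": "env_api_key"}):
        ScrapeGraphAI(api_key="explicit_api_key")
        mock_client.assert_called_once_with(api_key="explicit_api_key")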
