Skip to content

Commit 0d8df21

Browse files
authored
fix: enhance and update errors in crawl4ai (camel-ai#2147)
1 parent bbd63fb commit 0d8df21

File tree

5 files changed

+1062
-1043
lines changed

5 files changed

+1062
-1043
lines changed

camel/loaders/crawl4ai_reader.py

Lines changed: 17 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,11 @@
1414

1515
import asyncio
1616
import logging
17-
from typing import Any, Dict, List, Optional
17+
from typing import Any, Dict, List, Optional, Set
1818

19-
from dotenv import load_dotenv
2019
from pydantic import BaseModel, ValidationError
2120

22-
from camel.utils import api_keys_required
23-
2421
logger = logging.getLogger(__name__)
25-
load_dotenv()
2622

2723

2824
class Crawl4AI:
@@ -31,17 +27,14 @@ class Crawl4AI:
3127
This class uses asynchronous crawling with CSS selectors or LLM-based
3228
extraction to convert entire websites into structured data.
3329
34-
Args:
35-
None: No parameters are required for initialization.
36-
3730
References:
3831
https://docs.crawl4ai.com/
3932
"""
4033

4134
def __init__(self) -> None:
4235
from crawl4ai import AsyncWebCrawler
4336

44-
self.crawler = AsyncWebCrawler
37+
self.crawler_class = AsyncWebCrawler
4538

4639
async def _run_crawler(self, url: str, **kwargs) -> Any:
4740
r"""Run the asynchronous web crawler on a given URL.
@@ -58,7 +51,7 @@ async def _run_crawler(self, url: str, **kwargs) -> Any:
5851
"""
5952

6053
try:
61-
async with self.crawler() as c:
54+
async with self.crawler_class() as c:
6255
return await c.arun(url, **kwargs)
6356
except Exception as e:
6457
logger.error("Crawler run failed: %s", e)
@@ -76,9 +69,9 @@ async def crawl(
7669
Args:
7770
start_url (str): URL to start crawling from.
7871
max_depth (int, optional): Maximum depth of links to follow
79-
(default: 1).
72+
(default: :obj:`1`)
8073
extraction_strategy (ExtractionStrategy, optional): Strategy
81-
for data extraction.
74+
for data extraction. (default: :obj:`None`)
8275
**kwargs: Additional arguments for crawler configuration.
8376
8477
Returns:
@@ -89,8 +82,8 @@ async def crawl(
8982
"""
9083

9184
all_results: List[Dict[str, Any]] = []
92-
visited_urls: set[str] = set()
93-
queue: asyncio.Queue[tuple[str, int]] = asyncio.Queue()
85+
visited_urls: Set[str] = set()
86+
queue: asyncio.Queue = asyncio.Queue()
9487

9588
await queue.put((start_url, 1))
9689
visited_urls.add(start_url)
@@ -114,7 +107,10 @@ async def crawl(
114107
if depth < max_depth and result.links:
115108
for _, links in result.links.items():
116109
for link in links:
117-
if link['href'] not in visited_urls:
110+
if (
111+
'href' in link
112+
and link['href'] not in visited_urls
113+
):
118114
visited_urls.add(link['href'])
119115
await queue.put((link['href'], depth + 1))
120116

@@ -139,7 +135,7 @@ async def scrape(
139135
Args:
140136
url (str): URL to scrape.
141137
extraction_strategy (ExtractionStrategy, optional): Extraction
142-
strategy to use.
138+
strategy to use. (default: :obj:`None`)
143139
**kwargs: Additional arguments for crawler configuration.
144140
145141
Returns:
@@ -161,15 +157,6 @@ async def scrape(
161157
"links": result.links,
162158
}
163159

164-
@api_keys_required(
165-
[
166-
# ("api_key", "ANTHROPIC_API_KEY"),
167-
# ("api_key", "DEEPSEEK_API_KEY"),
168-
# ("api_key", "GEMINI_API_KEY"),
169-
# ("api_key", "GROQ_API_KEY"),
170-
("api_key", "OPENAI_API_KEY"),
171-
]
172-
)
173160
async def structured_scrape(
174161
self,
175162
url: str,
@@ -185,9 +172,9 @@ async def structured_scrape(
185172
response_format (BaseModel): Model defining the expected output
186173
schema.
187174
api_key (str, optional): API key for the LLM provider
188-
(default: "no-token").
175+
(default: :obj:`None`).
189176
llm_provider (str, optional): Identifier for the LLM provider
190-
(default: 'ollama/llama3').
177+
(default: :obj:`'ollama/llama3'`).
191178
**kwargs: Additional arguments for crawler configuration.
192179
193180
Returns:
@@ -222,11 +209,11 @@ async def structured_scrape(
222209
except Exception as e:
223210
raise RuntimeError(e) from e
224211

225-
async def map_site(self, url: str, **kwargs) -> List[str]:
212+
async def map_site(self, start_url: str, **kwargs) -> List[str]:
226213
r"""Map a website by extracting all accessible URLs.
227214
228215
Args:
229-
url (str): Starting URL to map.
216+
start_url (str): Starting URL to map.
230217
**kwargs: Additional configuration arguments.
231218
232219
Returns:
@@ -237,7 +224,7 @@ async def map_site(self, url: str, **kwargs) -> List[str]:
237224
"""
238225

239226
try:
240-
result = await self.crawl(url, **kwargs)
227+
result = await self.crawl(start_url, **kwargs)
241228
return [page["url"] for page in result]
242229
except Exception as e:
243230
raise RuntimeError(f"Failed to map url: {e}") from e

examples/loaders/crawl4ai_example.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
crawl_results = asyncio.run(crawler.crawl(URL, max_depth=1))
2828
print(crawl_results[0]["markdown"])
2929
'''
30+
===============================================================================
3031
![](img/zyte.png)
3132
3233
# Web Scraping Sandbox
@@ -69,20 +70,23 @@
6970
[ViewState](http://quotes.toscrape.com/search.aspx) | an AJAX based filter
7071
form with ViewStates
7172
[Random](http://quotes.toscrape.com/random) | a single random quote
73+
===============================================================================
7274
'''
7375

7476

7577
# --- Scraping ---
7678
scrape_result = asyncio.run(crawler.scrape(URL))
7779
print(scrape_result)
7880
'''
81+
===============================================================================
7982
{url: 'https://toscrape.com/',
8083
'raw_result': CrawlResult(url='https://toscrape.com/', markdown=...,
8184
cleaned_html=..., links=...),
8285
'markdown': "![](img/zyte.png)\n\n# Web Scraping Sandbox\n\n## Books...",
8386
'cleaned_html': '\n<div>\n<div>\n<div>\n<img class="logo" height="108"
8487
src="img/zyte.png" width="200"/>\n
8588
<h1>Web Scraping Sandbox</h1>\n...'}
89+
===============================================================================
8690
'''
8791

8892

pyproject.toml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ dependencies = [
3030
"pydantic>=2.10.6",
3131
"httpx>=0.28.0,<1.0.0dev",
3232
"psutil>=5.9.8,<6",
33-
"crawl4ai>=0.3.745",
3433
]
3534

3635
[project.optional-dependencies]
@@ -77,6 +76,7 @@ rag = [
7776
"cohere>=5.11.0,<6",
7877
"unstructured==0.16.20",
7978
"pandasai>=2.3.0,<3",
79+
"crawl4ai>=0.3.745",
8080
]
8181
web_tools = [
8282
"duckduckgo-search>=6.3.5,<7",
@@ -112,6 +112,7 @@ document_tools = [
112112
"openpyxl>=3.1.5",
113113
"docx>=0.2.4",
114114
"fpdf>=1.7.2",
115+
"crawl4ai>=0.3.745",
115116
]
116117
media_tools = [
117118
"imageio[pyav]>=2.34.2,<3",
@@ -231,6 +232,7 @@ owl = [
231232
"tree-sitter>=0.23.2,<0.24",
232233
"pandas>=1.5.3,<2",
233234
"rouge>=1.0.1,<2",
235+
"crawl4ai>=0.3.745",
234236
]
235237
all = [
236238
"numpy~=1.26",
@@ -336,7 +338,8 @@ all = [
336338
"typer>=0.15.2",
337339
"mem0ai>=0.1.67",
338340
"math-verify>=0.7.0,<0.8",
339-
"exa-py>=1.10.0,<2"
341+
"exa-py>=1.10.0,<2",
342+
"crawl4ai>=0.3.745",
340343
]
341344

342345
[project.urls]
@@ -516,7 +519,8 @@ module = [
516519
"typer",
517520
"mem0",
518521
"math_verify.*",
519-
"exa_py"
522+
"exa_py",
523+
"crawl4ai.*",
520524
]
521525
ignore_missing_imports = true
522526

test/loaders/test_crawl4ai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def __init__(
4242
def test_init():
4343
crawler = Crawl4AI()
4444
# Verify that the crawler attribute is set to AsyncWebCrawler
45-
assert crawler.crawler is not None
45+
assert crawler.crawl is not None
4646

4747

4848
@pytest.mark.asyncio

0 commit comments

Comments
 (0)