Skip to content

Commit 0d8df21

Browse files
authored
fix: enhance and update errors in crawl4ai (camel-ai#2147)
1 parent bbd63fb commit 0d8df21

File tree

5 files changed

+1062
-1043
lines changed

5 files changed

+1062
-1043
lines changed

camel/loaders/crawl4ai_reader.py

Lines changed: 17 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,11 @@
1414

1515
import asyncio
1616
import logging
17-
from typing import Any, Dict, List, Optional
17+
from typing import Any, Dict, List, Optional, Set
1818

19-
from dotenv import load_dotenv
2019
from pydantic import BaseModel, ValidationError
2120

22-
from camel.utils import api_keys_required
23-
2421
logger = logging.getLogger(__name__)
25-
load_dotenv()
2622

2723

2824
class Crawl4AI:
@@ -31,17 +27,14 @@ class Crawl4AI:
3127
This class uses asynchronous crawling with CSS selectors or LLM-based
3228
extraction to convert entire websites into structured data.
3329
34-
Args:
35-
None: No parameters are required for initialization.
36-
3730
References:
3831
https://docs.crawl4ai.com/
3932
"""
4033

4134
def __init__(self) -> None:
4235
from crawl4ai import AsyncWebCrawler
4336

44-
self.crawler = AsyncWebCrawler
37+
self.crawler_class = AsyncWebCrawler
4538

4639
async def _run_crawler(self, url: str, **kwargs) -> Any:
4740
r"""Run the asynchronous web crawler on a given URL.
@@ -58,7 +51,7 @@ async def _run_crawler(self, url: str, **kwargs) -> Any:
5851
"""
5952

6053
try:
61-
async with self.crawler() as c:
54+
async with self.crawler_class() as c:
6255
return await c.arun(url, **kwargs)
6356
except Exception as e:
6457
logger.error("Crawler run failed: %s", e)
@@ -76,9 +69,9 @@ async def crawl(
7669
Args:
7770
start_url (str): URL to start crawling from.
7871
max_depth (int, optional): Maximum depth of links to follow
79-
(default: 1).
72+
(default: :obj:`1`)
8073
extraction_strategy (ExtractionStrategy, optional): Strategy
81-
for data extraction.
74+
for data extraction. (default: :obj:`None`)
8275
**kwargs: Additional arguments for crawler configuration.
8376
8477
Returns:
@@ -89,8 +82,8 @@ async def crawl(
8982
"""
9083

9184
all_results: List[Dict[str, Any]] = []
92-
visited_urls: set[str] = set()
93-
queue: asyncio.Queue[tuple[str, int]] = asyncio.Queue()
85+
visited_urls: Set[str] = set()
86+
queue: asyncio.Queue = asyncio.Queue()
9487

9588
await queue.put((start_url, 1))
9689
visited_urls.add(start_url)
@@ -114,7 +107,10 @@ async def crawl(
114107
if depth < max_depth and result.links:
115108
for _, links in result.links.items():
116109
for link in links:
117-
if link['href'] not in visited_urls:
110+
if (
111+
'href' in link
112+
and link['href'] not in visited_urls
113+
):
118114
visited_urls.add(link['href'])
119115
await queue.put((link['href'], depth + 1))
120116

@@ -139,7 +135,7 @@ async def scrape(
139135
Args:
140136
url (str): URL to scrape.
141137
extraction_strategy (ExtractionStrategy, optional): Extraction
142-
strategy to use.
138+
strategy to use. (default: :obj:`None`)
143139
**kwargs: Additional arguments for crawler configuration.
144140
145141
Returns:
@@ -161,15 +157,6 @@ async def scrape(
161157
"links": result.links,
162158
}
163159

164-
@api_keys_required(
165-
[
166-
# ("api_key", "ANTHROPIC_API_KEY"),
167-
# ("api_key", "DEEPSEEK_API_KEY"),
168-
# ("api_key", "GEMINI_API_KEY"),
169-
# ("api_key", "GROQ_API_KEY"),
170-
("api_key", "OPENAI_API_KEY"),
171-
]
172-
)
173160
async def structured_scrape(
174161
self,
175162
url: str,
@@ -185,9 +172,9 @@ async def structured_scrape(
185172
response_format (BaseModel): Model defining the expected output
186173
schema.
187174
api_key (str, optional): API key for the LLM provider
188-
(default: "no-token").
175+
(default: :obj:`None`).
189176
llm_provider (str, optional): Identifier for the LLM provider
190-
(default: 'ollama/llama3').
177+
(default: :obj:`'ollama/llama3'`).
191178
**kwargs: Additional arguments for crawler configuration.
192179
193180
Returns:
@@ -222,11 +209,11 @@ async def structured_scrape(
222209
except Exception as e:
223210
raise RuntimeError(e) from e
224211

225-
async def map_site(self, url: str, **kwargs) -> List[str]:
212+
async def map_site(self, start_url: str, **kwargs) -> List[str]:
226213
r"""Map a website by extracting all accessible URLs.
227214
228215
Args:
229-
url (str): Starting URL to map.
216+
start_url (str): Starting URL to map.
230217
**kwargs: Additional configuration arguments.
231218
232219
Returns:
@@ -237,7 +224,7 @@ async def map_site(self, url: str, **kwargs) -> List[str]:
237224
"""
238225

239226
try:
240-
result = await self.crawl(url, **kwargs)
227+
result = await self.crawl(start_url, **kwargs)
241228
return [page["url"] for page in result]
242229
except Exception as e:
243230
raise RuntimeError(f"Failed to map url: {e}") from e

examples/loaders/crawl4ai_example.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
crawl_results = asyncio.run(crawler.crawl(URL, max_depth=1))
2828
print(crawl_results[0]["markdown"])
2929
'''
30+
===============================================================================
3031
![](img/zyte.png)
3132
3233
# Web Scraping Sandbox
@@ -69,20 +70,23 @@
6970
[ViewState](http://quotes.toscrape.com/search.aspx) | an AJAX based filter
7071
form with ViewStates
7172
[Random](http://quotes.toscrape.com/random) | a single random quote
73+
===============================================================================
7274
'''
7375

7476

7577
# --- Scraping ---
7678
scrape_result = asyncio.run(crawler.scrape(URL))
7779
print(scrape_result)
7880
'''
81+
===============================================================================
7982
{url: 'https://toscrape.com/',
8083
'raw_result': CrawlResult(url='https://toscrape.com/', markdown=...,
8184
cleaned_html=..., links=...),
8285
'markdown': "![](img/zyte.png)\n\n# Web Scraping Sandbox\n\n## Books...",
8386
'cleaned_html': '\n<div>\n<div>\n<div>\n<img class="logo" height="108"
8487
src="img/zyte.png" width="200"/>\n
8588
<h1>Web Scraping Sandbox</h1>\n...'}
89+
===============================================================================
8690
'''
8791

8892

pyproject.toml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ dependencies = [
3030
"pydantic>=2.10.6",
3131
"httpx>=0.28.0,<1.0.0dev",
3232
"psutil>=5.9.8,<6",
33-
"crawl4ai>=0.3.745",
3433
]
3534

3635
[project.optional-dependencies]
@@ -77,6 +76,7 @@ rag = [
7776
"cohere>=5.11.0,<6",
7877
"unstructured==0.16.20",
7978
"pandasai>=2.3.0,<3",
79+
"crawl4ai>=0.3.745",
8080
]
8181
web_tools = [
8282
"duckduckgo-search>=6.3.5,<7",
@@ -112,6 +112,7 @@ document_tools = [
112112
"openpyxl>=3.1.5",
113113
"docx>=0.2.4",
114114
"fpdf>=1.7.2",
115+
"crawl4ai>=0.3.745",
115116
]
116117
media_tools = [
117118
"imageio[pyav]>=2.34.2,<3",
@@ -231,6 +232,7 @@ owl = [
231232
"tree-sitter>=0.23.2,<0.24",
232233
"pandas>=1.5.3,<2",
233234
"rouge>=1.0.1,<2",
235+
"crawl4ai>=0.3.745",
234236
]
235237
all = [
236238
"numpy~=1.26",
@@ -336,7 +338,8 @@ all = [
336338
"typer>=0.15.2",
337339
"mem0ai>=0.1.67",
338340
"math-verify>=0.7.0,<0.8",
339-
"exa-py>=1.10.0,<2"
341+
"exa-py>=1.10.0,<2",
342+
"crawl4ai>=0.3.745",
340343
]
341344

342345
[project.urls]
@@ -516,7 +519,8 @@ module = [
516519
"typer",
517520
"mem0",
518521
"math_verify.*",
519-
"exa_py"
522+
"exa_py",
523+
"crawl4ai.*",
520524
]
521525
ignore_missing_imports = true
522526

test/loaders/test_crawl4ai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def __init__(
4242
def test_init():
4343
crawler = Crawl4AI()
4444
# Verify that the crawler attribute is set to AsyncWebCrawler
45-
assert crawler.crawler is not None
45+
assert crawler.crawl is not None
4646

4747

4848
@pytest.mark.asyncio

0 commit comments

Comments
 (0)