1414
1515import asyncio
1616import logging
17- from typing import Any , Dict , List , Optional
17+ from typing import Any , Dict , List , Optional , Set
1818
19- from dotenv import load_dotenv
2019from pydantic import BaseModel , ValidationError
2120
22- from camel .utils import api_keys_required
23-
2421logger = logging .getLogger (__name__ )
25- load_dotenv ()
2622
2723
2824class Crawl4AI :
@@ -31,17 +27,14 @@ class Crawl4AI:
3127 This class uses asynchronous crawling with CSS selectors or LLM-based
3228 extraction to convert entire websites into structured data.
3329
34- Args:
35- None: No parameters are required for initialization.
36-
3730 References:
3831 https://docs.crawl4ai.com/
3932 """
4033
def __init__(self) -> None:
    r"""Store the crawler class that later crawl runs will instantiate.

    The crawl4ai import happens here, at construction time, rather than
    at module import time. Only the class object is kept; a crawler
    instance is created elsewhere, when a crawl is actually run.
    """
    from crawl4ai import AsyncWebCrawler as crawler_cls

    # Keep the class itself (not an instance) on the object.
    self.crawler_class = crawler_cls
4538
4639 async def _run_crawler (self , url : str , ** kwargs ) -> Any :
4740 r"""Run the asynchronous web crawler on a given URL.
@@ -58,7 +51,7 @@ async def _run_crawler(self, url: str, **kwargs) -> Any:
5851 """
5952
6053 try :
61- async with self .crawler () as c :
54+ async with self .crawler_class () as c :
6255 return await c .arun (url , ** kwargs )
6356 except Exception as e :
6457 logger .error ("Crawler run failed: %s" , e )
@@ -76,9 +69,9 @@ async def crawl(
7669 Args:
7770 start_url (str): URL to start crawling from.
7871 max_depth (int, optional): Maximum depth of links to follow
79- (default: 1).
72+ (default: :obj:`1`)
8073 extraction_strategy (ExtractionStrategy, optional): Strategy
81- for data extraction.
74+ for data extraction. (default: :obj:`None`)
8275 **kwargs: Additional arguments for crawler configuration.
8376
8477 Returns:
@@ -89,8 +82,8 @@ async def crawl(
8982 """
9083
9184 all_results : List [Dict [str , Any ]] = []
92- visited_urls : set [str ] = set ()
93- queue : asyncio .Queue [ tuple [ str , int ]] = asyncio .Queue ()
85+ visited_urls : Set [str ] = set ()
86+ queue : asyncio .Queue = asyncio .Queue ()
9487
9588 await queue .put ((start_url , 1 ))
9689 visited_urls .add (start_url )
@@ -114,7 +107,10 @@ async def crawl(
114107 if depth < max_depth and result .links :
115108 for _ , links in result .links .items ():
116109 for link in links :
117- if link ['href' ] not in visited_urls :
110+ if (
111+ 'href' in link
112+ and link ['href' ] not in visited_urls
113+ ):
118114 visited_urls .add (link ['href' ])
119115 await queue .put ((link ['href' ], depth + 1 ))
120116
@@ -139,7 +135,7 @@ async def scrape(
139135 Args:
140136 url (str): URL to scrape.
141137 extraction_strategy (ExtractionStrategy, optional): Extraction
142- strategy to use.
138+ strategy to use. (default: :obj:`None`)
143139 **kwargs: Additional arguments for crawler configuration.
144140
145141 Returns:
@@ -161,15 +157,6 @@ async def scrape(
161157 "links" : result .links ,
162158 }
163159
164- @api_keys_required (
165- [
166- # ("api_key", "ANTHROPIC_API_KEY"),
167- # ("api_key", "DEEPSEEK_API_KEY"),
168- # ("api_key", "GEMINI_API_KEY"),
169- # ("api_key", "GROQ_API_KEY"),
170- ("api_key" , "OPENAI_API_KEY" ),
171- ]
172- )
173160 async def structured_scrape (
174161 self ,
175162 url : str ,
@@ -185,9 +172,9 @@ async def structured_scrape(
185172 response_format (BaseModel): Model defining the expected output
186173 schema.
187174 api_key (str, optional): API key for the LLM provider
188- (default: "no-token" ).
175+ (default: :obj:`None` ).
189176 llm_provider (str, optional): Identifier for the LLM provider
190- (default: 'ollama/llama3').
177+ (default: :obj:` 'ollama/llama3'` ).
191178 **kwargs: Additional arguments for crawler configuration.
192179
193180 Returns:
@@ -222,11 +209,11 @@ async def structured_scrape(
222209 except Exception as e :
223210 raise RuntimeError (e ) from e
224211
async def map_site(self, start_url: str, **kwargs) -> List[str]:
    r"""Collect the URL of every page reached by crawling a website.

    Args:
        start_url (str): URL at which the crawl begins.
        **kwargs: Extra options passed through unchanged to
            :meth:`crawl`.

    Returns:
        List[str]: The ``"url"`` entry of each page record returned by
            :meth:`crawl`, in the order the records were returned.

    Raises:
        RuntimeError: If the crawl fails or a page record cannot be
            read; the original exception is chained as the cause.
    """

    try:
        pages = await self.crawl(start_url, **kwargs)
        discovered: List[str] = []
        for entry in pages:
            discovered.append(entry["url"])
    except Exception as exc:
        # Surface any failure (crawl or record access) as a single
        # error type while preserving the underlying cause.
        raise RuntimeError(f"Failed to map url: {exc}") from exc
    return discovered
0 commit comments