Skip to content

Commit 811aa11

Browse files
committed
added capability to get existing contexts for use by scraper
1 parent a1618db commit 811aa11

File tree

1 file changed

+53
-0
lines changed

1 file changed

+53
-0
lines changed

scrapy_playwright/handler.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ def __init__(self, crawler: Crawler) -> None:
8383
) or settings.getint("CONCURRENT_REQUESTS")
8484
self.context_launch_lock = asyncio.Lock()
8585
self.context_wrappers: Dict[str, BrowserContextWrapper] = {}
86+
self.existing_context: bool = settings.getbool("PLAYWRIGHT_EXISTING_CONTEXT")
8687
self.startup_context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS")
8788
if settings.getint("PLAYWRIGHT_MAX_CONTEXTS"):
8889
self.context_semaphore = asyncio.Semaphore(
@@ -136,6 +137,16 @@ async def _launch(self) -> None:
136137
self._set_max_concurrent_context_count()
137138
logger.info("Startup context(s) launched")
138139
self.stats.set_value("playwright/page_count", self._get_total_page_count())
140+
if self.existing_context and self.browser_cdp_url:
141+
logger.info("Getting existing context(s)")
142+
if not hasattr(self, "browser"):
143+
await self._maybe_connect_devtools()
144+
await asyncio.gather(
145+
*[self._get_existing_browser_context(index=i) for i in range(len(self.browser.contexts))]
146+
)
147+
self._set_max_concurrent_context_count()
148+
logger.info("Existing context(s) retrieved")
149+
self.stats.set_value("playwright/page_count", self._get_total_page_count())
139150
del self.startup_context_kwargs
140151

141152
async def _maybe_launch_browser(self) -> None:
@@ -209,6 +220,48 @@ async def _create_browser_context(
209220
self._set_max_concurrent_context_count()
210221
return self.context_wrappers[name]
211222

223+
async def _get_existing_browser_context(
    self,
    index: int = 0,
    spider: Optional[Spider] = None,
) -> Optional[BrowserContextWrapper]:
    """Register a context that already exists in the CDP-connected browser.

    The context at position ``index`` of ``self.browser.contexts`` is stored
    in ``self.context_wrappers`` under the name ``existing_context_<index>``.
    A "close" callback is attached to it, the context stats are incremented,
    and the default navigation timeout is applied when one is configured.

    Args:
        index: Position of the context in ``self.browser.contexts``.
        spider: Forwarded to the close callback and to the log record.

    Returns:
        The wrapper stored for the context, or ``None`` when no CDP URL is
        configured or ``index`` does not refer to an existing context.
    """
    if not self.browser_cdp_url:
        return None
    # Connect before touching ``self.browser``: the attribute may not exist
    # yet, and the index lookup below needs it.
    if not hasattr(self, "browser"):
        await self._maybe_connect_devtools()
    # Resolve the context before acquiring the semaphore so an out-of-range
    # index returns None without leaving the semaphore held.
    try:
        context = self.browser.contexts[index]
    except IndexError:
        return None
    if hasattr(self, "context_semaphore"):
        # NOTE(review): presumably released by the close callback built
        # below - confirm against _make_close_browser_context_callback.
        await self.context_semaphore.acquire()
    name = f"existing_context_{index}"
    # The context was not created by this handler: it is reported as
    # non-persistent and remote.
    persistent = False
    remote = True

    context.on(
        "close", self._make_close_browser_context_callback(name, persistent, remote, spider)
    )
    self.stats.inc_value("playwright/context_count")
    self.stats.inc_value(f"playwright/context_count/persistent/{persistent}")
    self.stats.inc_value(f"playwright/context_count/remote/{remote}")
    logger.debug(
        "Browser context started: '%s' (persistent=%s, remote=%s)",
        name,
        persistent,
        remote,
        extra={
            "spider": spider,
            "context_name": name,
            "persistent": persistent,
            "remote": remote,
        },
    )
    if self.default_navigation_timeout is not None:
        context.set_default_navigation_timeout(self.default_navigation_timeout)
    self.context_wrappers[name] = BrowserContextWrapper(
        context=context,
        semaphore=asyncio.Semaphore(value=self.max_pages_per_context),
        persistent=persistent,
    )
    self._set_max_concurrent_context_count()
    return self.context_wrappers[name]
264+
212265
async def _create_page(self, request: Request, spider: Spider) -> Page:
213266
"""Create a new page in a context, also creating a new context if necessary."""
214267
context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME)

0 commit comments

Comments
 (0)