diff --git a/docs/guides/code_examples/playwright_crawler/extending-playwright-browser-plugin.py b/docs/guides/code_examples/playwright_crawler/extending-playwright-browser-plugin.py
new file mode 100644
index 0000000000..0845e84957
--- /dev/null
+++ b/docs/guides/code_examples/playwright_crawler/extending-playwright-browser-plugin.py
@@ -0,0 +1,60 @@
+import asyncio
+from typing import Any
+
+from typing_extensions import override
+
+from crawlee._utils.context import ensure_context
+from crawlee.browsers import BrowserPool, PlaywrightBrowserController, PlaywrightBrowserPlugin
+from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
+
+
+class CustomBrowserPlugin(PlaywrightBrowserPlugin):
+    """A custom browser plugin that launches a browser from a custom executable path."""
+
+    def __init__(self, executable_path: str, **kwargs: Any) -> None:
+        """Initialize the plugin.
+
+        Args:
+            executable_path: Path of the browser executable to launch.
+            kwargs: Keyword arguments forwarded unchanged to `PlaywrightBrowserPlugin`.
+        """
+        super().__init__(**kwargs)
+        self._executable_path = executable_path
+
+    @ensure_context
+    @override
+    async def new_browser(self) -> PlaywrightBrowserController:
+        """Launch Chromium from the custom executable and wrap it in a controller.
+
+        Raises:
+            RuntimeError: If `self._playwright` is not set.
+        """
+        if not self._playwright:
+            raise RuntimeError('Playwright browser plugin is not initialized.')
+
+        browser = await self._playwright.chromium.launch(
+            executable_path=self._executable_path,
+            headless=True,
+        )
+        return PlaywrightBrowserController(
+            browser=browser,
+            max_open_pages_per_browser=self.max_open_pages_per_browser,
+        )
+
+
+async def main() -> None:
+    """Crawl a single page using a browser pool backed by the custom plugin."""
+    # Replace the placeholder path with the location of your browser executable.
+    plugin = CustomBrowserPlugin(executable_path='/path/to/custom/browser')
+    browser_pool = BrowserPool(plugins=[plugin])
+    crawler = PlaywrightCrawler(browser_pool=browser_pool)
+
+    @crawler.router.default_handler
+    async def handler(context: PlaywrightCrawlingContext) -> None:
+        context.log.info(f'Crawling: {context.request.url}')
+
+    await crawler.run(['https://crawlee.dev'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/playwright_crawler.mdx b/docs/guides/playwright_crawler.mdx
index effa8e99c2..3bc5dc3626 100644
--- a/docs/guides/playwright_crawler.mdx
+++ 
b/docs/guides/playwright_crawler.mdx
@@ -88,4 +88,20 @@ Navigation hooks allow for additional configuration at specific points during pa
 
+## Extending the browser plugin
+
+For full control over how the browser is launched, you can subclass `PlaywrightBrowserPlugin` and override its `new_browser` method. This lets you integrate any Playwright-compatible browser backend — such as a custom Chromium build, a stealth browser, or a browser with a persistent profile.
+
+The overridden `new_browser` method must return a `PlaywrightBrowserController` instance wrapping your custom browser. Pass your plugin to `BrowserPool`, which you then provide to `PlaywrightCrawler` via the `browser_pool` argument.
+
+<CodeBlock className="language-python">
+    {ExtendingPluginExample}
+</CodeBlock>
+
+For a real-world example of a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox).
+
+:::note
+Third-party projects that provide alternative browser backends for Crawlee can link to this section as the canonical reference for plugin subclassing.
+:::
+
 ## Conclusion
 
 This guide introduced the `PlaywrightCrawler` and explained how to configure it using `BrowserPool` and `PlaywrightBrowserPlugin`. You learned how to launch multiple browsers, configure browser and context settings, use `BrowserPool` lifecycle hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!