Spaces:

JimLin0704
/

Crawl4AI

Sleeping

Crawl4AI / crawl4ai /async_crawler_strategy.py

amaye15

test

03c0888 10 months ago

87.8 kB

	import asyncio
	import base64
	import time
	from abc import ABC, abstractmethod
	from typing import Callable, Dict, Any, List, Optional, Awaitable, Union
	import os, sys, shutil
	import tempfile, subprocess
	from playwright.async_api import async_playwright, Page, Browser, Error, BrowserContext
	from playwright.async_api import TimeoutError as PlaywrightTimeoutError
	from io import BytesIO
	from PIL import Image, ImageDraw, ImageFont
	from pathlib import Path
	from playwright.async_api import ProxySettings
	from pydantic import BaseModel
	import hashlib
	import json
	import uuid
	from .js_snippet import load_js_script
	from .models import AsyncCrawlResponse
	from .utils import get_error_context
	from .user_agent_generator import UserAgentGenerator
	from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT
	from .async_configs import BrowserConfig, CrawlerRunConfig
	from .async_logger import AsyncLogger
	from playwright_stealth import StealthConfig, stealth_async
	from .ssl_certificate import SSLCertificate

	stealth_config = StealthConfig(
	webdriver=True,
	chrome_app=True,
	chrome_csi=True,
	chrome_load_times=True,
	chrome_runtime=True,
	navigator_languages=True,
	navigator_plugins=True,
	navigator_permissions=True,
	webgl_vendor=True,
	outerdimensions=True,
	navigator_hardware_concurrency=True,
	media_codecs=True,
	)

	BROWSER_DISABLE_OPTIONS = [
	"--disable-background-networking",
	"--disable-background-timer-throttling",
	"--disable-backgrounding-occluded-windows",
	"--disable-breakpad",
	"--disable-client-side-phishing-detection",
	"--disable-component-extensions-with-background-pages",
	"--disable-default-apps",
	"--disable-extensions",
	"--disable-features=TranslateUI",
	"--disable-hang-monitor",
	"--disable-ipc-flooding-protection",
	"--disable-popup-blocking",
	"--disable-prompt-on-repost",
	"--disable-sync",
	"--force-color-profile=srgb",
	"--metrics-recording-only",
	"--no-first-run",
	"--password-store=basic",
	"--use-mock-keychain",
	]


	class ManagedBrowser:
	"""
	Manages the browser process and context. This class allows to connect to the browser using CDP protocol.

	Attributes:
	browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
	Default: "chromium".
	user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
	temporary directory may be used. Default: None.
	headless (bool): Whether to run the browser in headless mode (no visible GUI).
	Default: True.
	browser_process (subprocess.Popen): The process object for the browser.
	temp_dir (str): Temporary directory for user data if not provided.
	debugging_port (int): Port for debugging the browser.
	host (str): Host for debugging the browser.

	Methods:
	start(): Starts the browser process and returns the CDP endpoint URL.
	_get_browser_path(): Returns the browser executable path based on OS and browser type.
	_get_browser_args(): Returns browser-specific command line arguments.
	_get_user_data_dir(): Returns the user data directory path.
	_cleanup(): Terminates the browser process and removes the temporary directory.
	"""

	browser_type: str
	user_data_dir: str
	headless: bool
	browser_process: subprocess.Popen
	temp_dir: str
	debugging_port: int
	host: str
	def __init__(
	self,
	browser_type: str = "chromium",
	user_data_dir: Optional[str] = None,
	headless: bool = False,
	logger=None,
	host: str = "localhost",
	debugging_port: int = 9222,
	):
	"""
	Initialize the ManagedBrowser instance.

	Args:
	browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
	Default: "chromium".
	user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
	temporary directory may be used. Default: None.
	headless (bool): Whether to run the browser in headless mode (no visible GUI).
	Default: True.
	logger (logging.Logger): Logger instance for logging messages. Default: None.
	host (str): Host for debugging the browser. Default: "localhost".
	debugging_port (int): Port for debugging the browser. Default: 9222.
	"""
	self.browser_type = browser_type
	self.user_data_dir = user_data_dir
	self.headless = headless
	self.browser_process = None
	self.temp_dir = None
	self.debugging_port = debugging_port
	self.host = host
	self.logger = logger
	self.shutting_down = False

	async def start(self) -> str:
	"""
	Starts the browser process and returns the CDP endpoint URL.
	If user_data_dir is not provided, creates a temporary directory.
	"""

	# Create temp dir if needed
	if not self.user_data_dir:
	self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-")
	self.user_data_dir = self.temp_dir

	# Get browser path and args based on OS and browser type
	browser_path = self._get_browser_path()
	args = self._get_browser_args()

	# Start browser process
	try:
	self.browser_process = subprocess.Popen(
	args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
	)
	# Monitor browser process output for errors
	asyncio.create_task(self._monitor_browser_process())
	await asyncio.sleep(2) # Give browser time to start
	return f"http://{self.host}:{self.debugging_port}"
	except Exception as e:
	await self.cleanup()
	raise Exception(f"Failed to start browser: {e}")

	async def _monitor_browser_process(self):
	"""
	Monitor the browser process for unexpected termination.

	How it works:
	1. Read stdout and stderr from the browser process.
	2. If the process has terminated, log the error message and terminate the browser.
	3. If the shutting_down flag is set, log the normal termination message.
	4. If any other error occurs, log the error message.

	Note: This method should be called in a separate task to avoid blocking the main event loop.
	"""
	if self.browser_process:
	try:
	stdout, stderr = await asyncio.gather(
	asyncio.to_thread(self.browser_process.stdout.read),
	asyncio.to_thread(self.browser_process.stderr.read),
	)

	# Check shutting_down flag BEFORE logging anything
	if self.browser_process.poll() is not None:
	if not self.shutting_down:
	self.logger.error(
	message="Browser process terminated unexpectedly \| Code: {code} \| STDOUT: {stdout} \| STDERR: {stderr}",
	tag="ERROR",
	params={
	"code": self.browser_process.returncode,
	"stdout": stdout.decode(),
	"stderr": stderr.decode(),
	},
	)
	await self.cleanup()
	else:
	self.logger.info(
	message="Browser process terminated normally \| Code: {code}",
	tag="INFO",
	params={"code": self.browser_process.returncode},
	)
	except Exception as e:
	if not self.shutting_down:
	self.logger.error(
	message="Error monitoring browser process: {error}",
	tag="ERROR",
	params={"error": str(e)},
	)

	def _get_browser_path(self) -> str:
	"""Returns the browser executable path based on OS and browser type"""
	if sys.platform == "darwin": # macOS
	paths = {
	"chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
	"firefox": "/Applications/Firefox.app/Contents/MacOS/firefox",
	"webkit": "/Applications/Safari.app/Contents/MacOS/Safari",
	}
	elif sys.platform == "win32": # Windows
	paths = {
	"chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
	"firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe",
	"webkit": None, # WebKit not supported on Windows
	}
	else: # Linux
	paths = {
	"chromium": "google-chrome",
	"firefox": "firefox",
	"webkit": None, # WebKit not supported on Linux
	}

	return paths.get(self.browser_type)

	def _get_browser_args(self) -> List[str]:
	"""Returns browser-specific command line arguments"""
	base_args = [self._get_browser_path()]

	if self.browser_type == "chromium":
	args = [
	f"--remote-debugging-port={self.debugging_port}",
	f"--user-data-dir={self.user_data_dir}",
	]
	if self.headless:
	args.append("--headless=new")
	elif self.browser_type == "firefox":
	args = [
	"--remote-debugging-port",
	str(self.debugging_port),
	"--profile",
	self.user_data_dir,
	]
	if self.headless:
	args.append("--headless")
	else:
	raise NotImplementedError(f"Browser type {self.browser_type} not supported")

	return base_args + args

	async def cleanup(self):
	"""Cleanup browser process and temporary directory"""
	# Set shutting_down flag BEFORE any termination actions
	self.shutting_down = True

	if self.browser_process:
	try:
	self.browser_process.terminate()
	# Wait for process to end gracefully
	for _ in range(10): # 10 attempts, 100ms each
	if self.browser_process.poll() is not None:
	break
	await asyncio.sleep(0.1)

	# Force kill if still running
	if self.browser_process.poll() is None:
	self.browser_process.kill()
	await asyncio.sleep(0.1) # Brief wait for kill to take effect

	except Exception as e:
	self.logger.error(
	message="Error terminating browser: {error}",
	tag="ERROR",
	params={"error": str(e)},
	)

	if self.temp_dir and os.path.exists(self.temp_dir):
	try:
	shutil.rmtree(self.temp_dir)
	except Exception as e:
	self.logger.error(
	message="Error removing temporary directory: {error}",
	tag="ERROR",
	params={"error": str(e)},
	)


	class BrowserManager:
	"""
	Manages the browser instance and context.

	Attributes:
	config (BrowserConfig): Configuration object containing all browser settings
	logger: Logger instance for recording events and errors
	browser (Browser): The browser instance
	default_context (BrowserContext): The default browser context
	managed_browser (ManagedBrowser): The managed browser instance
	playwright (Playwright): The Playwright instance
	sessions (dict): Dictionary to store session information
	session_ttl (int): Session timeout in seconds
	"""
	def __init__(self, browser_config: BrowserConfig, logger=None):
	"""
	Initialize the BrowserManager with a browser configuration.

	Args:
	browser_config (BrowserConfig): Configuration object containing all browser settings
	logger: Logger instance for recording events and errors
	"""
	self.config: BrowserConfig = browser_config
	self.logger = logger

	# Browser state
	self.browser = None
	self.default_context = None
	self.managed_browser = None
	self.playwright = None

	# Session management
	self.sessions = {}
	self.session_ttl = 1800 # 30 minutes

	# Initialize ManagedBrowser if needed
	if self.config.use_managed_browser:
	self.managed_browser = ManagedBrowser(
	browser_type=self.config.browser_type,
	user_data_dir=self.config.user_data_dir,
	headless=self.config.headless,
	logger=self.logger,
	debugging_port=self.config.debugging_port,
	)

	async def start(self):
	"""
	Start the browser instance and set up the default context.

	How it works:
	1. Check if Playwright is already initialized.
	2. If not, initialize Playwright.
	3. If managed browser is used, start it and connect to the CDP endpoint.
	4. If managed browser is not used, launch the browser and set up the default context.

	Note: This method should be called in a separate task to avoid blocking the main event loop.
	"""
	if self.playwright is None:
	from playwright.async_api import async_playwright

	self.playwright = await async_playwright().start()

	if self.config.use_managed_browser:
	cdp_url = await self.managed_browser.start()
	self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
	contexts = self.browser.contexts
	if contexts:
	self.default_context = contexts[0]
	else:
	self.default_context = await self.create_browser_context()
	# self.default_context = await self.browser.new_context(
	# viewport={
	# "width": self.config.viewport_width,
	# "height": self.config.viewport_height,
	# },
	# storage_state=self.config.storage_state,
	# user_agent=self.config.headers.get(
	# "User-Agent", self.config.user_agent
	# ),
	# accept_downloads=self.config.accept_downloads,
	# ignore_https_errors=self.config.ignore_https_errors,
	# java_script_enabled=self.config.java_script_enabled,
	# )
	await self.setup_context(self.default_context)
	else:
	browser_args = self._build_browser_args()

	# Launch appropriate browser type
	if self.config.browser_type == "firefox":
	self.browser = await self.playwright.firefox.launch(**browser_args)
	elif self.config.browser_type == "webkit":
	self.browser = await self.playwright.webkit.launch(**browser_args)
	else:
	self.browser = await self.playwright.chromium.launch(**browser_args)

	self.default_context = self.browser

	def _build_browser_args(self) -> dict:
	"""Build browser launch arguments from config."""
	args = [
	"--disable-gpu",
	"--disable-gpu-compositing",
	"--disable-software-rasterizer",
	"--no-sandbox",
	"--disable-dev-shm-usage",
	"--no-first-run",
	"--no-default-browser-check",
	"--disable-infobars",
	"--window-position=0,0",
	"--ignore-certificate-errors",
	"--ignore-certificate-errors-spki-list",
	"--disable-blink-features=AutomationControlled",
	"--window-position=400,0",
	"--disable-renderer-backgrounding",
	"--disable-ipc-flooding-protection",
	"--force-color-profile=srgb",
	"--mute-audio",
	"--disable-background-timer-throttling",
	# "--single-process",
	f"--window-size={self.config.viewport_width},{self.config.viewport_height}",
	]

	if self.config.light_mode:
	args.extend(BROWSER_DISABLE_OPTIONS)

	if self.config.text_mode:
	args.extend(
	[
	"--blink-settings=imagesEnabled=false",
	"--disable-remote-fonts",
	"--disable-images",
	"--disable-javascript",
	"--disable-software-rasterizer",
	"--disable-dev-shm-usage",
	]
	)

	if self.config.extra_args:
	args.extend(self.config.extra_args)

	browser_args = {"headless": self.config.headless, "args": args}

	if self.config.chrome_channel:
	browser_args["channel"] = self.config.chrome_channel

	if self.config.accept_downloads:
	browser_args["downloads_path"] = self.config.downloads_path or os.path.join(
	os.getcwd(), "downloads"
	)
	os.makedirs(browser_args["downloads_path"], exist_ok=True)

	if self.config.proxy or self.config.proxy_config:
	from playwright.async_api import ProxySettings

	proxy_settings = (
	ProxySettings(server=self.config.proxy)
	if self.config.proxy
	else ProxySettings(
	server=self.config.proxy_config.get("server"),
	username=self.config.proxy_config.get("username"),
	password=self.config.proxy_config.get("password"),
	)
	)
	browser_args["proxy"] = proxy_settings

	return browser_args

	async def setup_context(
	self,
	context: BrowserContext,
	crawlerRunConfig: CrawlerRunConfig,
	is_default=False,
	):
	"""
	Set up a browser context with the configured options.

	How it works:
	1. Set extra HTTP headers if provided.
	2. Add cookies if provided.
	3. Load storage state if provided.
	4. Accept downloads if enabled.
	5. Set default timeouts for navigation and download.
	6. Set user agent if provided.
	7. Set browser hints if provided.
	8. Set proxy if provided.
	9. Set downloads path if provided.
	10. Set storage state if provided.
	11. Set cache if provided.
	12. Set extra HTTP headers if provided.
	13. Add cookies if provided.
	14. Set default timeouts for navigation and download if enabled.
	15. Set user agent if provided.
	16. Set browser hints if provided.

	Args:
	context (BrowserContext): The browser context to set up
	crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
	is_default (bool): Flag indicating if this is the default context
	Returns:
	None
	"""
	if self.config.headers:
	await context.set_extra_http_headers(self.config.headers)

	if self.config.cookies:
	await context.add_cookies(self.config.cookies)

	if self.config.storage_state:
	await context.storage_state(path=None)

	if self.config.accept_downloads:
	context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT)
	context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
	if self.config.downloads_path:
	context._impl_obj._options["accept_downloads"] = True
	context._impl_obj._options["downloads_path"] = (
	self.config.downloads_path
	)

	# Handle user agent and browser hints
	if self.config.user_agent:
	combined_headers = {
	"User-Agent": self.config.user_agent,
	"sec-ch-ua": self.config.browser_hint,
	}
	combined_headers.update(self.config.headers)
	await context.set_extra_http_headers(combined_headers)

	# Add default cookie
	await context.add_cookies(
	[{"name": "cookiesEnabled", "value": "true", "url": crawlerRunConfig.url}]
	)

	# Handle navigator overrides
	if (
	crawlerRunConfig.override_navigator
	or crawlerRunConfig.simulate_user
	or crawlerRunConfig.magic
	):
	await context.add_init_script(load_js_script("navigator_overrider"))

	async def create_browser_context(self):
	"""
	Creates and returns a new browser context with configured settings.
	Applies text-only mode settings if text_mode is enabled in config.

	Returns:
	Context: Browser context object with the specified configurations
	"""
	# Base settings
	user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
	viewport_settings = {
	"width": self.config.viewport_width,
	"height": self.config.viewport_height,
	}
	proxy_settings = {"server": self.config.proxy} if self.config.proxy else None

	blocked_extensions = [
	# Images
	'jpg', 'jpeg', 'png', 'gif', 'webp', 'svg', 'ico', 'bmp', 'tiff', 'psd',
	# Fonts
	'woff', 'woff2', 'ttf', 'otf', 'eot',
	# Styles
	# 'css', 'less', 'scss', 'sass',
	# Media
	'mp4', 'webm', 'ogg', 'avi', 'mov', 'wmv', 'flv', 'm4v',
	'mp3', 'wav', 'aac', 'm4a', 'opus', 'flac',
	# Documents
	'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
	# Archives
	'zip', 'rar', '7z', 'tar', 'gz',
	# Scripts and data
	'xml', 'swf', 'wasm'
	]

	# Common context settings
	context_settings = {
	"user_agent": user_agent,
	"viewport": viewport_settings,
	"proxy": proxy_settings,
	"accept_downloads": self.config.accept_downloads,
	"storage_state": self.config.storage_state,
	"ignore_https_errors": self.config.ignore_https_errors,
	"device_scale_factor": 1.0,
	"java_script_enabled": self.config.java_script_enabled,
	}

	if self.config.text_mode:
	text_mode_settings = {
	"has_touch": False,
	"is_mobile": False,
	}
	# Update context settings with text mode settings
	context_settings.update(text_mode_settings)

	# Create and return the context with all settings
	context = await self.browser.new_context(**context_settings)

	# Apply text mode settings if enabled
	if self.config.text_mode:
	# Create and apply route patterns for each extension
	for ext in blocked_extensions:
	await context.route(f"*/.{ext}", lambda route: route.abort())
	return context

	# async def get_page(self, session_id: Optional[str], user_agent: str):
	async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
	"""
	Get a page for the given session ID, creating a new one if needed.

	Args:
	crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings

	Returns:
	Page: The page object for the given session ID.
	BrowserContext: The browser context for the given session ID.
	"""
	self._cleanup_expired_sessions()

	if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions:
	context, page, _ = self.sessions[crawlerRunConfig.session_id]
	self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
	return page, context

	if self.config.use_managed_browser:
	context = self.default_context
	page = await context.new_page()
	else:
	context = await self.create_browser_context()
	await self.setup_context(context, crawlerRunConfig)
	page = await context.new_page()

	if crawlerRunConfig.session_id:
	self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())

	return page, context

	async def kill_session(self, session_id: str):
	"""
	Kill a browser session and clean up resources.

	Args:
	session_id (str): The session ID to kill.
	"""
	if session_id in self.sessions:
	context, page, _ = self.sessions[session_id]
	await page.close()
	if not self.config.use_managed_browser:
	await context.close()
	del self.sessions[session_id]

	def _cleanup_expired_sessions(self):
	"""Clean up expired sessions based on TTL."""
	current_time = time.time()
	expired_sessions = [
	sid
	for sid, (_, _, last_used) in self.sessions.items()
	if current_time - last_used > self.session_ttl
	]
	for sid in expired_sessions:
	asyncio.create_task(self.kill_session(sid))

	async def close(self):
	"""Close all browser resources and clean up."""
	if self.config.sleep_on_close:
	await asyncio.sleep(0.5)

	session_ids = list(self.sessions.keys())
	for session_id in session_ids:
	await self.kill_session(session_id)

	if self.browser:
	await self.browser.close()
	self.browser = None

	if self.managed_browser:
	await asyncio.sleep(0.5)
	await self.managed_browser.cleanup()
	self.managed_browser = None

	if self.playwright:
	await self.playwright.stop()
	self.playwright = None


	class AsyncCrawlerStrategy(ABC):
	"""
	Abstract base class for crawler strategies.
	Subclasses must implement the crawl method.
	"""
	@abstractmethod
	async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
	pass # 4 + 3



	class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
	"""
	Crawler strategy using Playwright.

	Attributes:
	browser_config (BrowserConfig): Configuration object containing browser settings.
	logger (AsyncLogger): Logger instance for recording events and errors.
	_downloaded_files (List[str]): List of downloaded file paths.
	hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior.
	browser_manager (BrowserManager): Manager for browser creation and management.

	Methods:
	__init__(self, browser_config=None, logger=None, **kwargs):
	Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.
	__aenter__(self):
	Start the browser and initialize the browser manager.
	__aexit__(self, exc_type, exc_val, exc_tb):
	Close the browser and clean up resources.
	start(self):
	Start the browser and initialize the browser manager.
	close(self):
	Close the browser and clean up resources.
	kill_session(self, session_id):
	Kill a browser session and clean up resources.
	crawl(self, url, **kwargs):
	Run the crawler for a single URL.

	"""
	def __init__(
	self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, **kwargs
	):
	"""
	Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.

	Args:
	browser_config (BrowserConfig): Configuration object containing browser settings.
	If None, will be created from kwargs for backwards compatibility.
	logger: Logger instance for recording events and errors.
	**kwargs: Additional arguments for backwards compatibility and extending functionality.
	"""
	# Initialize browser config, either from provided object or kwargs
	self.browser_config = browser_config or BrowserConfig.from_kwargs(kwargs)
	self.logger = logger

	# Initialize session management
	self._downloaded_files = []

	# Initialize hooks system
	self.hooks = {
	"on_browser_created": None,
	"on_page_context_created": None,
	"on_user_agent_updated": None,
	"on_execution_started": None,
	"before_goto": None,
	"after_goto": None,
	"before_return_html": None,
	"before_retrieve_html": None,
	}

	# Initialize browser manager with config
	self.browser_manager = BrowserManager(
	browser_config=self.browser_config, logger=self.logger
	)

	async def __aenter__(self):
	await self.start()
	return self

	async def __aexit__(self, exc_type, exc_val, exc_tb):
	await self.close()

	async def start(self):
	"""
	Start the browser and initialize the browser manager.
	"""
	await self.browser_manager.start()
	await self.execute_hook(
	"on_browser_created",
	self.browser_manager.browser,
	context=self.browser_manager.default_context,
	)

	async def close(self):
	"""
	Close the browser and clean up resources.
	"""
	await self.browser_manager.close()

	async def kill_session(self, session_id: str):
	"""
	Kill a browser session and clean up resources.

	Args:
	session_id (str): The ID of the session to kill.

	Returns:
	None
	"""
	# Log a warning message and no need kill session, in new version auto kill session
	self.logger.warning(
	message="Session auto-kill is enabled in the new version. No need to manually kill sessions.",
	tag="WARNING",
	)
	await self.browser_manager.kill_session(session_id)

	def set_hook(self, hook_type: str, hook: Callable):
	"""
	Set a hook function for a specific hook type. Following are list of hook types:
	- on_browser_created: Called when a new browser instance is created.
	- on_page_context_created: Called when a new page context is created.
	- on_user_agent_updated: Called when the user agent is updated.
	- on_execution_started: Called when the execution starts.
	- before_goto: Called before a goto operation.
	- after_goto: Called after a goto operation.
	- before_return_html: Called before returning HTML content.
	- before_retrieve_html: Called before retrieving HTML content.

	All hooks except on_browser_created accepts a context and a page as arguments and kwargs. However, on_browser_created accepts a browser and a context as arguments and kwargs.

	Args:
	hook_type (str): The type of the hook.
	hook (Callable): The hook function to set.

	Returns:
	None
	"""
	if hook_type in self.hooks:
	self.hooks[hook_type] = hook
	else:
	raise ValueError(f"Invalid hook type: {hook_type}")

	async def execute_hook(self, hook_type: str, args, *kwargs):
	"""
	Execute a hook function for a specific hook type.

	Args:
	hook_type (str): The type of the hook.
	*args: Variable length positional arguments.
	**kwargs: Keyword arguments.

	Returns:
	The return value of the hook function, if any.
	"""
	hook = self.hooks.get(hook_type)
	if hook:
	if asyncio.iscoroutinefunction(hook):
	return await hook(args, *kwargs)
	else:
	return hook(args, *kwargs)
	return args[0] if args else None

	def update_user_agent(self, user_agent: str):
	"""
	Update the user agent for the browser.

	Args:
	user_agent (str): The new user agent string.

	Returns:
	None
	"""
	self.user_agent = user_agent

	def set_custom_headers(self, headers: Dict[str, str]):
	"""
	Set custom headers for the browser.

	Args:
	headers (Dict[str, str]): A dictionary of headers to set.

	Returns:
	None
	"""
	self.headers = headers

	async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
	"""
	Wait for a condition in a smart way. This functions works as below:

	1. If wait_for starts with 'js:', it assumes it's a JavaScript function and waits for it to return true.
	2. If wait_for starts with 'css:', it assumes it's a CSS selector and waits for it to be present.
	3. Otherwise, it tries to evaluate wait_for as a JavaScript function and waits for it to return true.
	4. If it's not a JavaScript function, it assumes it's a CSS selector and waits for it to be present.

	This is a more advanced version of the wait_for parameter in CrawlerStrategy.crawl().
	Args:
	page: Playwright page object
	wait_for (str): The condition to wait for. Can be a CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'.
	timeout (float): Maximum time to wait in milliseconds

	Returns:
	None
	"""
	wait_for = wait_for.strip()

	if wait_for.startswith("js:"):
	# Explicitly specified JavaScript
	js_code = wait_for[3:].strip()
	return await self.csp_compliant_wait(page, js_code, timeout)
	elif wait_for.startswith("css:"):
	# Explicitly specified CSS selector
	css_selector = wait_for[4:].strip()
	try:
	await page.wait_for_selector(css_selector, timeout=timeout)
	except Error as e:
	if "Timeout" in str(e):
	raise TimeoutError(
	f"Timeout after {timeout}ms waiting for selector '{css_selector}'"
	)
	else:
	raise ValueError(f"Invalid CSS selector: '{css_selector}'")
	else:
	# Auto-detect based on content
	if wait_for.startswith("()") or wait_for.startswith("function"):
	# It's likely a JavaScript function
	return await self.csp_compliant_wait(page, wait_for, timeout)
	else:
	# Assume it's a CSS selector first
	try:
	await page.wait_for_selector(wait_for, timeout=timeout)
	except Error as e:
	if "Timeout" in str(e):
	raise TimeoutError(
	f"Timeout after {timeout}ms waiting for selector '{wait_for}'"
	)
	else:
	# If it's not a timeout error, it might be an invalid selector
	# Let's try to evaluate it as a JavaScript function as a fallback
	try:
	return await self.csp_compliant_wait(
	page, f"() => {{{wait_for}}}", timeout
	)
	except Error:
	raise ValueError(
	f"Invalid wait_for parameter: '{wait_for}'. "
	"It should be either a valid CSS selector, a JavaScript function, "
	"or explicitly prefixed with 'js:' or 'css:'."
	)

	async def csp_compliant_wait( self, page: Page, user_wait_function: str, timeout: float = 30000 ):
	"""
	Wait for a condition in a CSP-compliant way.

	Args:
	page: Playwright page object
	user_wait_function: JavaScript function as string that returns boolean
	timeout: Maximum time to wait in milliseconds

	Returns:
	bool: True if condition was met, False if timed out

	Raises:
	RuntimeError: If there's an error evaluating the condition
	"""
	wrapper_js = f"""
	async () => {{
	const userFunction = {user_wait_function};
	const startTime = Date.now();
	try {{
	while (true) {{
	if (await userFunction()) {{
	return true;
	}}
	if (Date.now() - startTime > {timeout}) {{
	return false; // Return false instead of throwing
	}}
	await new Promise(resolve => setTimeout(resolve, 100));
	}}
	}} catch (error) {{
	throw new Error(`Error evaluating condition: ${{error.message}}`);
	}}
	}}
	"""

	try:
	result = await page.evaluate(wrapper_js)
	return result
	except Exception as e:
	if "Error evaluating condition" in str(e):
	raise RuntimeError(f"Failed to evaluate wait condition: {str(e)}")
	# For timeout or other cases, just return False
	return False

	async def process_iframes(self, page):
	"""
	Process iframes on a page. This function will extract the content of each iframe and replace it with a div containing the extracted content.

	Args:
	page: Playwright page object

	Returns:
	Playwright page object
	"""
	# Find all iframes
	iframes = await page.query_selector_all("iframe")

	for i, iframe in enumerate(iframes):
	try:
	# Add a unique identifier to the iframe
	await iframe.evaluate(f'(element) => element.id = "iframe-{i}"')

	# Get the frame associated with this iframe
	frame = await iframe.content_frame()

	if frame:
	# Wait for the frame to load
	await frame.wait_for_load_state(
	"load", timeout=30000
	) # 30 seconds timeout

	# Extract the content of the iframe's body
	iframe_content = await frame.evaluate(
	"() => document.body.innerHTML"
	)

	# Generate a unique class name for this iframe
	class_name = f"extracted-iframe-content-{i}"

	# Replace the iframe with a div containing the extracted content
	_iframe = iframe_content.replace("`", "\\`")
	await page.evaluate(
	f"""
	() => {{
	const iframe = document.getElementById('iframe-{i}');
	const div = document.createElement('div');
	div.innerHTML = `{_iframe}`;
	div.className = '{class_name}';
	iframe.replaceWith(div);
	}}
	"""
	)
	else:
	self.logger.warning(
	message="Could not access content frame for iframe {index}",
	tag="SCRAPE",
	params={"index": i},
	)
	except Exception as e:
	self.logger.error(
	message="Error processing iframe {index}: {error}",
	tag="ERROR",
	params={"index": i, "error": str(e)},
	)

	# Return the page object
	return page

	async def create_session(self, **kwargs) -> str:
	"""
	Creates a new browser session and returns its ID. A browse session is a unique openned page can be reused for multiple crawls.
	This function is asynchronous and returns a string representing the session ID.

	Args:
	**kwargs: Optional keyword arguments to configure the session.

	Returns:
	str: The session ID.
	"""
	await self.start()

	session_id = kwargs.get("session_id") or str(uuid.uuid4())

	user_agent = kwargs.get("user_agent", self.user_agent)
	# Use browser_manager to get a fresh page & context assigned to this session_id
	page, context = await self.browser_manager.get_page(session_id, user_agent)
	return session_id

	async def crawl( self, url: str, config: CrawlerRunConfig, **kwargs ) -> AsyncCrawlResponse:
	"""
	Crawls a given URL or processes raw HTML/local file content based on the URL prefix.

	Args:
	url (str): The URL to crawl. Supported prefixes:
	- 'http://' or 'https://': Web URL to crawl.
	- 'file://': Local file path to process.
	- 'raw://': Raw HTML content to process.
	**kwargs: Additional parameters:
	- 'screenshot' (bool): Whether to take a screenshot.
	- ... [other existing parameters]

	Returns:
	AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
	"""
	config = config or CrawlerRunConfig.from_kwargs(kwargs)
	response_headers = {}
	status_code = 200 # Default for local/raw HTML
	screenshot_data = None

	if url.startswith(("http://", "https://")):
	return await self._crawl_web(url, config)

	elif url.startswith("file://"):
	# Process local file
	local_file_path = url[7:] # Remove 'file://' prefix
	if not os.path.exists(local_file_path):
	raise FileNotFoundError(f"Local file not found: {local_file_path}")
	with open(local_file_path, "r", encoding="utf-8") as f:
	html = f.read()
	if config.screenshot:
	screenshot_data = await self._generate_screenshot_from_html(html)
	return AsyncCrawlResponse(
	html=html,
	response_headers=response_headers,
	status_code=status_code,
	screenshot=screenshot_data,
	get_delayed_content=None,
	)

	elif url.startswith("raw:") or url.startswith("raw://"):
	# Process raw HTML content
	raw_html = url[4:] if url[:4] == "raw:" else url[7:]
	html = raw_html
	if config.screenshot:
	screenshot_data = await self._generate_screenshot_from_html(html)
	return AsyncCrawlResponse(
	html=html,
	response_headers=response_headers,
	status_code=status_code,
	screenshot=screenshot_data,
	get_delayed_content=None,
	)
	else:
	raise ValueError(
	"URL must start with 'http://', 'https://', 'file://', or 'raw:'"
	)

	async def _crawl_web( self, url: str, config: CrawlerRunConfig ) -> AsyncCrawlResponse:
	"""
	Internal method to crawl web URLs with the specified configuration.

	Args:
	url (str): The web URL to crawl
	config (CrawlerRunConfig): Configuration object controlling the crawl behavior

	Returns:
	AsyncCrawlResponse: The response containing HTML, headers, status code, and optional data
	"""
	config.url = url
	response_headers = {}
	status_code = None

	# Reset downloaded files list for new crawl
	self._downloaded_files = []

	# Handle user agent with magic mode
	user_agent = self.browser_config.user_agent
	if config.magic and self.browser_config.user_agent_mode != "random":
	self.browser_config.user_agent = UserAgentGenerator().generate(
	**(self.browser_config.user_agent_generator_config or {})
	)

	# Get page for session
	page, context = await self.browser_manager.get_page(crawlerRunConfig=config)

	# Add default cookie
	await context.add_cookies(
	[{"name": "cookiesEnabled", "value": "true", "url": url}]
	)

	# Handle navigator overrides
	if config.override_navigator or config.simulate_user or config.magic:
	await context.add_init_script(load_js_script("navigator_overrider"))

	# Call hook after page creation
	await self.execute_hook("on_page_context_created", page, context=context)

	# Set up console logging if requested
	if config.log_console:

	def log_consol(
	msg, console_log_type="debug"
	): # Corrected the parameter syntax
	if console_log_type == "error":
	self.logger.error(
	message=f"Console error: {msg}", # Use f-string for variable interpolation
	tag="CONSOLE",
	params={"msg": msg.text},
	)
	elif console_log_type == "debug":
	self.logger.debug(
	message=f"Console: {msg}", # Use f-string for variable interpolation
	tag="CONSOLE",
	params={"msg": msg.text},
	)

	page.on("console", log_consol)
	page.on("pageerror", lambda e: log_consol(e, "error"))

	try:
	# Get SSL certificate information if requested and URL is HTTPS
	ssl_cert = None
	if config.fetch_ssl_certificate:
	ssl_cert = SSLCertificate.from_url(url)

	# Set up download handling
	if self.browser_config.accept_downloads:
	page.on(
	"download",
	lambda download: asyncio.create_task(
	self._handle_download(download)
	),
	)

	# Handle page navigation and content loading
	if not config.js_only:
	await self.execute_hook("before_goto", page, context=context, url=url)

	try:
	# Generate a unique nonce for this request
	nonce = hashlib.sha256(os.urandom(32)).hexdigest()

	# Add CSP headers to the request
	await page.set_extra_http_headers({
	'Content-Security-Policy': f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
	})

	response = await page.goto(
	url, wait_until=config.wait_until, timeout=config.page_timeout
	)
	except Error as e:
	raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")

	await self.execute_hook("after_goto", page, context=context, url=url, response=response)

	if response is None:
	status_code = 200
	response_headers = {}
	else:
	status_code = response.status
	response_headers = response.headers

	else:
	status_code = 200
	response_headers = {}

	# Wait for body element and visibility
	try:
	await page.wait_for_selector("body", state="attached", timeout=30000)

	# Use the new check_visibility function with csp_compliant_wait
	is_visible = await self.csp_compliant_wait(
	page,
	"""() => {
	const element = document.body;
	if (!element) return false;
	const style = window.getComputedStyle(element);
	const isVisible = style.display !== 'none' &&
	style.visibility !== 'hidden' &&
	style.opacity !== '0';
	return isVisible;
	}""",
	timeout=30000
	)

	if not is_visible and not config.ignore_body_visibility:
	visibility_info = await self.check_visibility(page)
	raise Error(f"Body element is hidden: {visibility_info}")

	except Error as e:
	visibility_info = await self.check_visibility(page)

	if self.config.verbose:
	self.logger.debug(
	message="Body visibility info: {info}",
	tag="DEBUG",
	params={"info": visibility_info},
	)

	if not config.ignore_body_visibility:
	raise Error(f"Body element is hidden: {visibility_info}")


	# try:
	# await page.wait_for_selector("body", state="attached", timeout=30000)

	# await page.wait_for_function(
	# """
	# () => {
	# const body = document.body;
	# const style = window.getComputedStyle(body);
	# return style.display !== 'none' &&
	# style.visibility !== 'hidden' &&
	# style.opacity !== '0';
	# }
	# """,
	# timeout=30000,
	# )
	# except Error as e:
	# visibility_info = await page.evaluate(
	# """
	# () => {
	# const body = document.body;
	# const style = window.getComputedStyle(body);
	# return {
	# display: style.display,
	# visibility: style.visibility,
	# opacity: style.opacity,
	# hasContent: body.innerHTML.length,
	# classList: Array.from(body.classList)
	# }
	# }
	# """
	# )

	# if self.config.verbose:
	# self.logger.debug(
	# message="Body visibility info: {info}",
	# tag="DEBUG",
	# params={"info": visibility_info},
	# )

	# if not config.ignore_body_visibility:
	# raise Error(f"Body element is hidden: {visibility_info}")

	# Handle content loading and viewport adjustment
	if not self.browser_config.text_mode and (
	config.wait_for_images or config.adjust_viewport_to_content
	):
	await page.wait_for_load_state("domcontentloaded")
	await asyncio.sleep(0.1)

	# Check for image loading with improved error handling
	images_loaded = await self.csp_compliant_wait(
	page,
	"() => Array.from(document.getElementsByTagName('img')).every(img => img.complete)",
	timeout=1000
	)

	if not images_loaded and self.logger:
	self.logger.warning(
	message="Some images failed to load within timeout",
	tag="SCRAPE",
	)

	# Adjust viewport if needed
	if not self.browser_config.text_mode and config.adjust_viewport_to_content:
	try:
	dimensions = await self.get_page_dimensions(page)
	page_height = dimensions['height']
	page_width = dimensions['width']
	# page_width = await page.evaluate(
	# "document.documentElement.scrollWidth"
	# )
	# page_height = await page.evaluate(
	# "document.documentElement.scrollHeight"
	# )

	target_width = self.browser_config.viewport_width
	target_height = int(target_width * page_width / page_height * 0.95)
	await page.set_viewport_size(
	{"width": target_width, "height": target_height}
	)

	scale = min(target_width / page_width, target_height / page_height)
	cdp = await page.context.new_cdp_session(page)
	await cdp.send(
	"Emulation.setDeviceMetricsOverride",
	{
	"width": page_width,
	"height": page_height,
	"deviceScaleFactor": 1,
	"mobile": False,
	"scale": scale,
	},
	)
	except Exception as e:
	self.logger.warning(
	message="Failed to adjust viewport to content: {error}",
	tag="VIEWPORT",
	params={"error": str(e)},
	)

	# Handle full page scanning
	if config.scan_full_page:
	await self._handle_full_page_scan(page, config.scroll_delay)

	# Execute JavaScript if provided
	# if config.js_code:
	# if isinstance(config.js_code, str):
	# await page.evaluate(config.js_code)
	# elif isinstance(config.js_code, list):
	# for js in config.js_code:
	# await page.evaluate(js)

	if config.js_code:
	# execution_result = await self.execute_user_script(page, config.js_code)
	execution_result = await self.robust_execute_user_script(page, config.js_code)
	if not execution_result["success"]:
	self.logger.warning(
	message="User script execution had issues: {error}",
	tag="JS_EXEC",
	params={"error": execution_result.get("error")}
	)

	await self.execute_hook("on_execution_started", page, context=context)

	# Handle user simulation
	if config.simulate_user or config.magic:
	await page.mouse.move(100, 100)
	await page.mouse.down()
	await page.mouse.up()
	await page.keyboard.press("ArrowDown")

	# Handle wait_for condition
	if config.wait_for:
	try:
	await self.smart_wait(
	page, config.wait_for, timeout=config.page_timeout
	)
	except Exception as e:
	raise RuntimeError(f"Wait condition failed: {str(e)}")

	# Update image dimensions if needed
	if not self.browser_config.text_mode:
	update_image_dimensions_js = load_js_script("update_image_dimensions")
	try:
	try:
	await page.wait_for_load_state("domcontentloaded", timeout=5)
	except PlaywrightTimeoutError:
	pass
	await page.evaluate(update_image_dimensions_js)
	except Exception as e:
	self.logger.error(
	message="Error updating image dimensions: {error}",
	tag="ERROR",
	params={"error": str(e)},
	)

	# Process iframes if needed
	if config.process_iframes:
	page = await self.process_iframes(page)

	# Pre-content retrieval hooks and delay
	await self.execute_hook("before_retrieve_html", page, context=context)
	if config.delay_before_return_html:
	await asyncio.sleep(config.delay_before_return_html)

	# Handle overlay removal
	if config.remove_overlay_elements:
	await self.remove_overlay_elements(page)

	# Get final HTML content
	html = await page.content()
	await self.execute_hook("before_return_html", page = page, html = html, context=context)

	# Handle PDF and screenshot generation
	start_export_time = time.perf_counter()
	pdf_data = None
	screenshot_data = None

	if config.pdf:
	pdf_data = await self.export_pdf(page)

	if config.screenshot:
	if config.screenshot_wait_for:
	await asyncio.sleep(config.screenshot_wait_for)
	screenshot_data = await self.take_screenshot(
	page, screenshot_height_threshold=config.screenshot_height_threshold
	)

	if screenshot_data or pdf_data:
	self.logger.info(
	message="Exporting PDF and taking screenshot took {duration:.2f}s",
	tag="EXPORT",
	params={"duration": time.perf_counter() - start_export_time},
	)

	# Define delayed content getter
	async def get_delayed_content(delay: float = 5.0) -> str:
	self.logger.info(
	message="Waiting for {delay} seconds before retrieving content for {url}",
	tag="INFO",
	params={"delay": delay, "url": url},
	)
	await asyncio.sleep(delay)
	return await page.content()

	# Return complete response
	return AsyncCrawlResponse(
	html=html,
	response_headers=response_headers,
	status_code=status_code,
	screenshot=screenshot_data,
	pdf_data=pdf_data,
	get_delayed_content=get_delayed_content,
	ssl_certificate=ssl_cert,
	downloaded_files=(
	self._downloaded_files if self._downloaded_files else None
	),
	)

	except Exception as e:
	raise e

	finally:
	# If no session_id is given we should close the page
	if not config.session_id:
	await page.close()

	async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
	"""
	Helper method to handle full page scanning.

	How it works:
	1. Get the viewport height.
	2. Scroll to the bottom of the page.
	3. Get the total height of the page.
	4. Scroll back to the top of the page.
	5. Scroll to the bottom of the page again.
	6. Continue scrolling until the bottom of the page is reached.

	Args:
	page (Page): The Playwright page object
	scroll_delay (float): The delay between page scrolls

	"""
	try:
	viewport_height = page.viewport_size.get(
	"height", self.browser_config.viewport_height
	)
	current_position = viewport_height

	# await page.evaluate(f"window.scrollTo(0, {current_position})")
	await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
	# await self.csp_scroll_to(page, 0, current_position)
	# await asyncio.sleep(scroll_delay)

	# total_height = await page.evaluate("document.documentElement.scrollHeight")
	dimensions = await self.get_page_dimensions(page)
	total_height = dimensions['height']

	while current_position < total_height:
	current_position = min(current_position + viewport_height, total_height)
	await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
	# await page.evaluate(f"window.scrollTo(0, {current_position})")
	# await asyncio.sleep(scroll_delay)

	# new_height = await page.evaluate("document.documentElement.scrollHeight")
	dimensions = await self.get_page_dimensions(page)
	new_height = dimensions['height']

	if new_height > total_height:
	total_height = new_height

	# await page.evaluate("window.scrollTo(0, 0)")
	await self.safe_scroll(page, 0, 0)

	except Exception as e:
	self.logger.warning(
	message="Failed to perform full page scan: {error}",
	tag="PAGE_SCAN",
	params={"error": str(e)},
	)
	else:
	# await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
	await self.safe_scroll(page, 0, total_height)

	async def _handle_download(self, download):
	"""
	Handle file downloads.

	How it works:
	1. Get the suggested filename.
	2. Get the download path.
	3. Log the download.
	4. Start the download.
	5. Save the downloaded file.
	6. Log the completion.

	Args:
	download (Download): The Playwright download object

	Returns:
	None
	"""
	try:
	suggested_filename = download.suggested_filename
	download_path = os.path.join(self.downloads_path, suggested_filename)

	self.logger.info(
	message="Downloading {filename} to {path}",
	tag="FETCH",
	params={"filename": suggested_filename, "path": download_path},
	)

	start_time = time.perf_counter()
	await download.save_as(download_path)
	end_time = time.perf_counter()
	self._downloaded_files.append(download_path)

	self.logger.success(
	message="Downloaded {filename} successfully",
	tag="COMPLETE",
	params={
	"filename": suggested_filename,
	"path": download_path,
	"duration": f"{end_time - start_time:.2f}s",
	},
	)
	except Exception as e:
	self.logger.error(
	message="Failed to handle download: {error}",
	tag="ERROR",
	params={"error": str(e)},
	)

	async def remove_overlay_elements(self, page: Page) -> None:
	"""
	Removes popup overlays, modals, cookie notices, and other intrusive elements from the page.

	Args:
	page (Page): The Playwright page instance
	"""
	remove_overlays_js = load_js_script("remove_overlay_elements")

	try:
	await page.evaluate(f"""
	(() => {{
	try {{
	{remove_overlays_js}
	return {{ success: true }};
	}} catch (error) {{
	return {{
	success: false,
	error: error.toString(),
	stack: error.stack
	}};
	}}
	}})()
	""")
	await page.wait_for_timeout(500) # Wait for any animations to complete
	except Exception as e:
	self.logger.warning(
	message="Failed to remove overlay elements: {error}",
	tag="SCRAPE",
	params={"error": str(e)},
	)

	async def export_pdf(self, page: Page) -> bytes:
	"""
	Exports the current page as a PDF.

	Args:
	page (Page): The Playwright page object

	Returns:
	bytes: The PDF data
	"""
	pdf_data = await page.pdf(print_background=True)
	return pdf_data

	async def take_screenshot(self, page, **kwargs) -> str:
	"""
	Take a screenshot of the current page.

	Args:
	page (Page): The Playwright page object
	kwargs: Additional keyword arguments

	Returns:
	str: The base64-encoded screenshot data
	"""
	need_scroll = await self.page_need_scroll(page)

	if not need_scroll:
	# Page is short enough, just take a screenshot
	return await self.take_screenshot_naive(page)
	else:
	# Page is too long, try to take a full-page screenshot
	return await self.take_screenshot_scroller(page, **kwargs)
	# return await self.take_screenshot_from_pdf(await self.export_pdf(page))

	async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str:
	"""
	Convert the first page of the PDF to a screenshot.

	Requires pdf2image and poppler.

	Args:
	pdf_data (bytes): The PDF data

	Returns:
	str: The base64-encoded screenshot data
	"""
	try:
	from pdf2image import convert_from_bytes

	images = convert_from_bytes(pdf_data)
	final_img = images[0].convert("RGB")
	buffered = BytesIO()
	final_img.save(buffered, format="JPEG")
	return base64.b64encode(buffered.getvalue()).decode("utf-8")
	except Exception as e:
	error_message = f"Failed to take PDF-based screenshot: {str(e)}"
	self.logger.error(
	message="PDF Screenshot failed: {error}",
	tag="ERROR",
	params={"error": error_message},
	)
	# Return error image as fallback
	img = Image.new("RGB", (800, 600), color="black")
	draw = ImageDraw.Draw(img)
	font = ImageFont.load_default()
	draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
	buffered = BytesIO()
	img.save(buffered, format="JPEG")
	return base64.b64encode(buffered.getvalue()).decode("utf-8")

	async def take_screenshot_scroller(self, page: Page, **kwargs) -> str:
	"""
	Attempt to set a large viewport and take a full-page screenshot.
	If still too large, segment the page as before.

	Requires pdf2image and poppler.

	Args:
	page (Page): The Playwright page object
	kwargs: Additional keyword arguments

	Returns:
	str: The base64-encoded screenshot data
	"""
	try:
	# Get page height
	dimensions = await self.get_page_dimensions(page)
	page_width = dimensions['width']
	page_height = dimensions['height']
	# page_height = await page.evaluate("document.documentElement.scrollHeight")
	# page_width = await page.evaluate("document.documentElement.scrollWidth")

	# Set a large viewport
	large_viewport_height = min(
	page_height,
	kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD),
	)
	await page.set_viewport_size(
	{"width": page_width, "height": large_viewport_height}
	)

	# Page still too long, segment approach
	segments = []
	viewport_size = page.viewport_size
	viewport_height = viewport_size["height"]

	num_segments = (page_height // viewport_height) + 1
	for i in range(num_segments):
	y_offset = i * viewport_height
	await page.evaluate(f"window.scrollTo(0, {y_offset})")
	await asyncio.sleep(0.01) # wait for render
	seg_shot = await page.screenshot(full_page=False)
	img = Image.open(BytesIO(seg_shot)).convert("RGB")
	segments.append(img)

	total_height = sum(img.height for img in segments)
	stitched = Image.new("RGB", (segments[0].width, total_height))
	offset = 0
	for img in segments:
	# stitched.paste(img, (0, offset))
	stitched.paste(img.convert("RGB"), (0, offset))
	offset += img.height

	buffered = BytesIO()
	stitched = stitched.convert("RGB")
	stitched.save(buffered, format="BMP", quality=85)
	encoded = base64.b64encode(buffered.getvalue()).decode("utf-8")

	return encoded
	except Exception as e:
	error_message = f"Failed to take large viewport screenshot: {str(e)}"
	self.logger.error(
	message="Large viewport screenshot failed: {error}",
	tag="ERROR",
	params={"error": error_message},
	)
	# return error image
	img = Image.new("RGB", (800, 600), color="black")
	draw = ImageDraw.Draw(img)
	font = ImageFont.load_default()
	draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
	buffered = BytesIO()
	img.save(buffered, format="JPEG")
	return base64.b64encode(buffered.getvalue()).decode("utf-8")
	finally:
	await page.close()

	async def take_screenshot_naive(self, page: Page) -> str:
	"""
	Takes a screenshot of the current page.

	Args:
	page (Page): The Playwright page instance

	Returns:
	str: Base64-encoded screenshot image
	"""
	try:
	# The page is already loaded, just take the screenshot
	screenshot = await page.screenshot(full_page=False)
	return base64.b64encode(screenshot).decode("utf-8")
	except Exception as e:
	error_message = f"Failed to take screenshot: {str(e)}"
	self.logger.error(
	message="Screenshot failed: {error}",
	tag="ERROR",
	params={"error": error_message},
	)

	# Generate an error image
	img = Image.new("RGB", (800, 600), color="black")
	draw = ImageDraw.Draw(img)
	font = ImageFont.load_default()
	draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)

	buffered = BytesIO()
	img.save(buffered, format="JPEG")
	return base64.b64encode(buffered.getvalue()).decode("utf-8")
	finally:
	await page.close()

	async def export_storage_state(self, path: str = None) -> dict:
	"""
	Exports the current storage state (cookies, localStorage, sessionStorage)
	to a JSON file at the specified path.

	Args:
	path (str): The path to save the storage state JSON file

	Returns:
	dict: The exported storage state
	"""
	if self.default_context:
	state = await self.default_context.storage_state(path=path)
	self.logger.info(
	message="Exported storage state to {path}",
	tag="INFO",
	params={"path": path},
	)
	return state
	else:
	self.logger.warning(
	message="No default_context available to export storage state.",
	tag="WARNING",
	)

	async def robust_execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]:
	"""
	Executes user-provided JavaScript code with proper error handling and context,
	supporting both synchronous and async user code, plus navigations.

	How it works:
	1. Wait for load state 'domcontentloaded'
	2. If js_code is a string, execute it directly
	3. If js_code is a list, execute each element in sequence
	4. Wait for load state 'networkidle'
	5. Return results

	Args:
	page (Page): The Playwright page instance
	js_code (Union[str, List[str]]): The JavaScript code to execute

	Returns:
	Dict[str, Any]: The results of the execution
	"""
	try:
	await page.wait_for_load_state('domcontentloaded')

	if isinstance(js_code, str):
	scripts = [js_code]
	else:
	scripts = js_code

	results = []
	for script in scripts:
	try:
	# Attempt the evaluate
	# If the user code triggers navigation, we catch the "context destroyed" error
	# then wait for the new page to load before continuing
	result = None
	try:
	result = await page.evaluate(f"""
	(async () => {{
	try {{
	{script}
	return {{ success: true }};
	}} catch (err) {{
	return {{ success: false, error: err.toString(), stack: err.stack }};
	}}
	}})();
	""")
	except Error as e:
	# If it's due to navigation destroying the context, handle gracefully
	if "Execution context was destroyed" in str(e):
	self.logger.info("Navigation triggered by script, waiting for load state", tag="JS_EXEC")
	try:
	await page.wait_for_load_state('load', timeout=30000)
	except Error as nav_err:
	self.logger.warning(
	message="Navigation wait failed: {error}",
	tag="JS_EXEC",
	params={"error": str(nav_err)}
	)
	try:
	await page.wait_for_load_state('networkidle', timeout=30000)
	except Error as nav_err:
	self.logger.warning(
	message="Network idle wait failed: {error}",
	tag="JS_EXEC",
	params={"error": str(nav_err)}
	)
	# Return partial success, or adapt as you see fit
	result = {
	"success": True,
	"info": "Navigation triggered, ignoring context destroyed error"
	}
	else:
	# It's some other error, log and continue
	self.logger.error(
	message="Playwright execution error: {error}",
	tag="JS_EXEC",
	params={"error": str(e)}
	)
	result = {"success": False, "error": str(e)}

	# If we made it this far with no repeated error, do post-load waits
	t1 = time.time()
	try:
	await page.wait_for_load_state('domcontentloaded', timeout=5000)
	print("DOM content loaded after script execution in", time.time() - t1)
	except Error as e:
	self.logger.warning(
	message="DOM content load timeout: {error}",
	tag="JS_EXEC",
	params={"error": str(e)}
	)

	# t1 = time.time()
	# try:
	# await page.wait_for_load_state('networkidle', timeout=5000)
	# print("Network idle after script execution in", time.time() - t1)
	# except Error as e:
	# self.logger.warning(
	# message="Network idle timeout: {error}",
	# tag="JS_EXEC",
	# params={"error": str(e)}
	# )

	results.append(result if result else {"success": True})

	except Exception as e:
	# Catch anything else
	self.logger.error(
	message="Script chunk failed: {error}",
	tag="JS_EXEC",
	params={"error": str(e)}
	)
	results.append({"success": False, "error": str(e)})

	return {"success": True, "results": results}

	except Exception as e:
	self.logger.error(
	message="Script execution failed: {error}",
	tag="JS_EXEC",
	params={"error": str(e)}
	)
	return {"success": False, "error": str(e)}

	async def execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]:
	"""
	Executes user-provided JavaScript code with proper error handling and context.

	Args:
	page: Playwright page object
	js_code: Single JavaScript string or list of JavaScript code strings

	Returns:
	Dict containing execution status and results/errors
	"""
	try:
	# Ensure the page is ready for script execution
	await page.wait_for_load_state('domcontentloaded')

	# Handle single script or multiple scripts
	if isinstance(js_code, str):
	scripts = [js_code]
	else:
	scripts = js_code

	results = []
	for script in scripts:
	try:
	# Execute the script and wait for network idle
	result = await page.evaluate(f"""
	(() => {{
	return new Promise((resolve) => {{
	try {{
	const result = (function() {{
	{script}
	}})();

	// If result is a promise, wait for it
	if (result instanceof Promise) {{
	result.then(() => {{
	// Wait a bit for any triggered effects
	setTimeout(() => resolve({{ success: true }}), 100);
	}}).catch(error => {{
	resolve({{
	success: false,
	error: error.toString(),
	stack: error.stack
	}});
	}});
	}} else {{
	// For non-promise results, still wait a bit for effects
	setTimeout(() => resolve({{ success: true }}), 100);
	}}
	}} catch (error) {{
	resolve({{
	success: false,
	error: error.toString(),
	stack: error.stack
	}});
	}}
	}});
	}})()
	""")

	# Wait for network idle after script execution
	t1 = time.time()
	await page.wait_for_load_state('domcontentloaded', timeout=5000)
	print("DOM content loaded after script execution in", time.time() - t1)

	t1 = time.time()
	await page.wait_for_load_state('networkidle', timeout=5000)
	print("Network idle after script execution in", time.time() - t1)

	results.append(result if result else {"success": True})

	except Error as e:
	# Handle Playwright-specific errors
	self.logger.error(
	message="Playwright execution error: {error}",
	tag="JS_EXEC",
	params={"error": str(e)}
	)
	results.append({"success": False, "error": str(e)})

	return {"success": True, "results": results}

	except Exception as e:
	self.logger.error(
	message="Script execution failed: {error}",
	tag="JS_EXEC",
	params={"error": str(e)}
	)
	return {"success": False, "error": str(e)}

	except Exception as e:
	self.logger.error(
	message="Script execution failed: {error}",
	tag="JS_EXEC",
	params={"error": str(e)}
	)
	return {"success": False, "error": str(e)}

	async def check_visibility(self, page):
	"""
	Checks if an element is visible on the page.

	Args:
	page: Playwright page object

	Returns:
	Boolean indicating visibility
	"""
	return await page.evaluate("""
	() => {
	const element = document.body;
	if (!element) return false;
	const style = window.getComputedStyle(element);
	const isVisible = style.display !== 'none' &&
	style.visibility !== 'hidden' &&
	style.opacity !== '0';
	return isVisible;
	}
	""")

	async def safe_scroll(self, page: Page, x: int, y: int, delay: float = 0.1):
	"""
	Safely scroll the page with rendering time.

	Args:
	page: Playwright page object
	x: Horizontal scroll position
	y: Vertical scroll position
	"""
	result = await self.csp_scroll_to(page, x, y)
	if result['success']:
	await page.wait_for_timeout(delay * 1000)
	return result

	async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]:
	"""
	Performs a CSP-compliant scroll operation and returns the result status.

	Args:
	page: Playwright page object
	x: Horizontal scroll position
	y: Vertical scroll position

	Returns:
	Dict containing scroll status and position information
	"""
	try:
	result = await page.evaluate(
	f"""() => {{
	try {{
	const startX = window.scrollX;
	const startY = window.scrollY;
	window.scrollTo({x}, {y});

	// Get final position after scroll
	const endX = window.scrollX;
	const endY = window.scrollY;

	return {{
	success: true,
	startPosition: {{ x: startX, y: startY }},
	endPosition: {{ x: endX, y: endY }},
	targetPosition: {{ x: {x}, y: {y} }},
	delta: {{
	x: Math.abs(endX - {x}),
	y: Math.abs(endY - {y})
	}}
	}};
	}} catch (e) {{
	return {{
	success: false,
	error: e.toString()
	}};
	}}
	}}"""
	)

	if not result['success']:
	self.logger.warning(
	message="Scroll operation failed: {error}",
	tag="SCROLL",
	params={"error": result.get('error')}
	)

	return result

	except Exception as e:
	self.logger.error(
	message="Failed to execute scroll: {error}",
	tag="SCROLL",
	params={"error": str(e)}
	)
	return {
	"success": False,
	"error": str(e)
	}

	async def get_page_dimensions(self, page: Page):
	"""
	Get the dimensions of the page.

	Args:
	page: Playwright page object

	Returns:
	Dict containing width and height of the page
	"""
	return await page.evaluate("""
	() => {
	const {scrollWidth, scrollHeight} = document.documentElement;
	return {width: scrollWidth, height: scrollHeight};
	}
	""")

	async def page_need_scroll(self, page: Page) -> bool:
	"""
	Determine whether the page need to scroll

	Args:
	page: Playwright page object

	Returns:
	bool: True if page needs scrolling
	"""
	try:
	need_scroll = await page.evaluate("""
	() => {
	const scrollHeight = document.documentElement.scrollHeight;
	const viewportHeight = window.innerHeight;
	return scrollHeight > viewportHeight;
	}
	""")
	return need_scroll
	except Exception as e:
	self.logger.warning(
	message="Failed to check scroll need: {error}. Defaulting to True for safety.",
	tag="SCROLL",
	params={"error": str(e)}
	)
	return True # Default to scrolling if check fails