# In custom_tools.py
from typing import List, Optional, Type

from bs4 import BeautifulSoup
from crewai.tools import BaseTool
from playwright.sync_api import sync_playwright
from pydantic import BaseModel, Field


class AdvancedScrapingToolSchema(BaseModel):
    """Input schema for the AdvancedScrapingTool."""
    website_url: str = Field(..., description="The full URL of the website to scrape.")
    base_selector: str = Field(..., description="The main CSS selector used to extract the initial content block. E.g.: '.page-content'")
    keep_selectors: Optional[List[str]] = Field(None, description="A list of CSS selectors to keep in the final result. Extraction will focus on these elements. E.g.: ['.title', '.description']")
    remove_selectors: Optional[List[str]] = Field(None, description="A list of CSS selectors to remove from the extracted content. E.g.: ['.ads', '.hide']")


class AdvancedScrapingTool(BaseTool):
    name: str = "Scrape and Filter Website Content"
    description: str = (
        "A powerful tool that extracts a block of content from a website using a base "
        "selector and then filters that content, keeping or removing specific elements. "
        "Handles dynamic content loaded by JavaScript."
    )
    args_schema: Type[BaseModel] = AdvancedScrapingToolSchema

    def _run(
        self,
        website_url: str,
        base_selector: str = 'body',
        keep_selectors: Optional[List[str]] = None,
        remove_selectors: Optional[List[str]] = None,
    ) -> str:
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)

                # --- START: ANTI-BOT BYPASS LOGIC ---
                # Create a browser context that looks like a real user's browser.
                context = browser.new_context(
                    user_agent=(
                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/91.0.4472.124 Safari/537.36"
                    ),
                    viewport={"width": 1920, "height": 1080},
                )
                page = context.new_page()
                # --- END: ANTI-BOT BYPASS LOGIC ---

                # Increase the default navigation timeout to handle challenge pages.
                page.goto(website_url, wait_until="domcontentloaded", timeout=30000)

                # Wait for the main container element to be ready.
                print(f"Waiting for base selector: '{base_selector}'")
                page.wait_for_selector(base_selector, timeout=20000)

                if keep_selectors:
                    print(f"Waiting for keep selectors: {keep_selectors}")
                    for selector in keep_selectors:
                        page.wait_for_selector(selector, timeout=15000)

                html_content = page.content()
                browser.close()

            soup = BeautifulSoup(html_content, 'lxml')
            base_content = soup.select_one(base_selector)
            if not base_content:
                return f"Error: the base selector '{base_selector}' was not found on the page."

            # Remove unwanted elements first, so they never reach the final output.
            if remove_selectors:
                for selector in remove_selectors:
                    for element in base_content.select(selector):
                        element.decompose()

            if keep_selectors:
                final_content = []
                for selector in keep_selectors:
                    for element in base_content.select(selector):
                        final_content.append(element.prettify())
                if not final_content:
                    return "None of the 'keep_selectors' were found in the extracted content after cleanup."
                return "\n".join(final_content)

            return base_content.prettify()
        except Exception as e:
            return f"An unexpected error occurred with Playwright or BeautifulSoup: {e}"
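

# --- Usage sketch (illustrative, not part of the tool itself) ---
# A minimal smoke test that calls _run() directly, bypassing the CrewAI agent
# layer. The URL and selectors below are hypothetical placeholders; swap in
# your own target page. In a real crew you would instead pass an
# AdvancedScrapingTool() instance to an Agent via its tools list.
if __name__ == "__main__":
    tool = AdvancedScrapingTool()
    result = tool._run(
        website_url="https://example.com/products",  # hypothetical URL
        base_selector=".page-content",               # hypothetical selectors
        keep_selectors=[".title", ".description"],
        remove_selectors=[".ads"],
    )
    # Print only a preview; prettified HTML can be long.
    print(result[:500])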