Spaces:
Sleeping
Sleeping
"""
Example demonstrating different extraction strategies with various input formats.

This example shows how to:
1. Use different input formats (markdown, HTML, fit_markdown)
2. Work with JSON-based extractors (CSS and XPath)
3. Use LLM-based extraction with different input formats
4. Configure browser and crawler settings properly
"""
| import asyncio | |
| import os | |
| from typing import Dict, Any | |
| from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode | |
| from crawl4ai.extraction_strategy import ( | |
| LLMExtractionStrategy, | |
| JsonCssExtractionStrategy, | |
| JsonXPathExtractionStrategy | |
| ) | |
| from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking | |
| from crawl4ai.content_filter_strategy import PruningContentFilter | |
| from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator | |
async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
    """Crawl ``url`` with a single extraction strategy and print a short report.

    Args:
        crawler: Crawler instance whose ``arun`` coroutine performs the fetch.
        url: Address of the page to crawl.
        strategy: Extraction strategy passed to ``CrawlerRunConfig``.
        name: Label used in the printed heading and in error messages.

    Returns:
        None. Results and errors are written to stdout. Any exception raised
        while configuring, crawling, or reporting is caught and printed, so a
        failing strategy does not abort the strategies that run after it.
    """
    try:
        # Bypass the cache so every strategy works on a fresh fetch.
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=strategy,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter()  # For fit_markdown support
            ),
        )

        result = await crawler.arun(url=url, config=run_config)

        if not result.success:
            # Surface the crawler's own failure reason when it provides one;
            # a bare "Crawl failed" gives the reader nothing to act on.
            reason = getattr(result, "error_message", None)
            detail = f" - {reason}" if reason else ""
            print(f"Error in {name}: Crawl failed{detail}")
            return

        print(f"\n=== {name} Results ===")
        print(f"Extracted Content: {result.extracted_content}")
        # NOTE(review): newer crawl4ai releases appear to expose these fields
        # via ``result.markdown`` instead of ``markdown_v2`` - confirm against
        # the installed version before relying on this attribute.
        markdown = result.markdown_v2
        print(f"Raw Markdown Length: {len(markdown.raw_markdown)}")
        print(f"Citations Markdown Length: {len(markdown.markdown_with_citations)}")
    except Exception as exc:
        # Deliberate best-effort boundary for an example script: report and
        # move on to the next strategy.
        print(f"Error in {name}: {exc}")
async def main():
    """Build each example extraction strategy and run them all on one URL.

    Three LLM strategies (default input, ``html`` and ``fit_markdown``) are
    created first, followed by a CSS-schema and an XPath-schema strategy.
    Each one is then passed in turn to ``run_extraction`` using a single
    shared crawler.
    """
    # Example URL (replace with actual URL)
    target_url = "https://example.com/product-page"

    # Browser settings shared by every run.
    browser_settings = BrowserConfig(headless=True, verbose=True)

    # 1. LLM extraction with different input formats.
    #    Provider and credentials are the same for all three strategies.
    llm_common = {
        "provider": "openai/gpt-4o-mini",
        "api_token": os.getenv("OPENAI_API_KEY"),
    }
    markdown_strategy = LLMExtractionStrategy(
        instruction="Extract product information including name, price, and description",
        **llm_common,
    )
    html_strategy = LLMExtractionStrategy(
        input_format="html",
        instruction="Extract product information from HTML including structured data",
        **llm_common,
    )
    fit_markdown_strategy = LLMExtractionStrategy(
        input_format="fit_markdown",
        instruction="Extract product information from cleaned markdown",
        **llm_common,
    )

    # 2. JSON CSS extraction (automatically uses HTML input).
    css_selectors = (
        ("title", "h1.product-title"),
        ("price", ".price"),
        ("description", ".description"),
    )
    css_strategy = JsonCssExtractionStrategy(
        schema={
            "baseSelector": ".product",
            "fields": [
                {"name": field, "selector": selector, "type": "text"}
                for field, selector in css_selectors
            ],
        }
    )

    # 3. JSON XPath extraction (automatically uses HTML input).
    xpath_selectors = (
        ("title", ".//h1[@class='product-title']/text()"),
        ("price", ".//span[@class='price']/text()"),
        ("description", ".//div[@class='description']/text()"),
    )
    xpath_strategy = JsonXPathExtractionStrategy(
        schema={
            "baseSelector": "//div[@class='product']",
            "fields": [
                {"name": field, "selector": selector, "type": "text"}
                for field, selector in xpath_selectors
            ],
        }
    )

    # Strategies paired with their report labels, in execution order.
    planned_runs = (
        (markdown_strategy, "Markdown LLM"),
        (html_strategy, "HTML LLM"),
        (fit_markdown_strategy, "Fit Markdown LLM"),
        (css_strategy, "CSS Extraction"),
        (xpath_strategy, "XPath Extraction"),
    )

    # Use context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_settings) as crawler:
        for strategy, label in planned_runs:
            await run_extraction(crawler, target_url, strategy, label)
if __name__ == "__main__":
    # Run the async example only when executed as a script, not on import.
    asyncio.run(main())