# LLM Extraction with AsyncWebCrawler
Crawl4AI's `AsyncWebCrawler` lets you use large language models (LLMs) to extract structured data or relevant content from web pages asynchronously. The two examples below show how to use `LLMExtractionStrategy` for different purposes, followed by more advanced patterns.
## Example 1: Extract Structured Data

In this example, we use the `LLMExtractionStrategy` to extract structured data (model names and their fees) from the OpenAI pricing page.
```python
import os
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field

class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

async def extract_openai_fees():
    url = 'https://openai.com/api/pricing/'

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",  # Or use Ollama, e.g. provider="ollama/nemotron"
                api_token=os.getenv('OPENAI_API_KEY'),
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="From the crawled content, extract all mentioned model names along with their "
                            "fees for input and output tokens. Make sure not to miss anything in the entire content. "
                            'One extracted model JSON format should look like this: '
                            '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
            ),
            bypass_cache=True,
        )

        model_fees = json.loads(result.extracted_content)
        print(f"Number of models extracted: {len(model_fees)}")

        os.makedirs(".data", exist_ok=True)  # Ensure the output directory exists before writing
        with open(".data/openai_fees.json", "w", encoding="utf-8") as f:
            json.dump(model_fees, f, indent=2)

asyncio.run(extract_openai_fees())
```
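Because the extraction is schema-driven, you can round-trip the parsed items through the same Pydantic model to catch any malformed entries the LLM may produce. A minimal sketch, assuming it runs in the same module as the example above so `model_fees` and `OpenAIModelFee` are in scope:

```python
from pydantic import ValidationError

# Re-validate each extracted item against OpenAIModelFee; skip anything malformed.
for item in model_fees:
    try:
        fee = OpenAIModelFee(**item)
        print(f"{fee.model_name}: input={fee.input_fee}, output={fee.output_fee}")
    except ValidationError as err:
        print(f"Skipping malformed item: {err}")
```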
## Example 2: Extract Relevant Content

In this example, we instruct the LLM to extract only content related to technology from the NBC News business page.
```python
import os
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

async def extract_tech_content():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",
                api_token=os.getenv('OPENAI_API_KEY'),
                instruction="Extract only content related to technology"
            ),
            bypass_cache=True,
        )

        tech_content = json.loads(result.extracted_content)
        print(f"Number of tech-related items extracted: {len(tech_content)}")

        os.makedirs(".data", exist_ok=True)  # Ensure the output directory exists before writing
        with open(".data/tech_content.json", "w", encoding="utf-8") as f:
            json.dump(tech_content, f, indent=2)

asyncio.run(extract_tech_content())
```
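The same instruction-driven strategy can be reused across several pages. A minimal sketch under that assumption, reusing one crawler session and one strategy to keep overhead down (the URL list at the bottom is hypothetical, for illustration only):

```python
async def extract_tech_from_many(urls):
    # One strategy instance is reused for every page.
    strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv('OPENAI_API_KEY'),
        instruction="Extract only content related to technology"
    )
    extracted = []
    async with AsyncWebCrawler(verbose=True) as crawler:
        for url in urls:
            result = await crawler.arun(url=url, extraction_strategy=strategy, bypass_cache=True)
            extracted.append(json.loads(result.extracted_content))
    return extracted

# Hypothetical usage:
# asyncio.run(extract_tech_from_many(["https://www.nbcnews.com/business", "https://www.nbcnews.com/tech"]))
```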
## Advanced Usage: Combining JS Execution with LLM Extraction

This example demonstrates how to combine JavaScript execution with LLM extraction to handle dynamic content:
```python
import os
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

async def extract_dynamic_content():
    # Click the "Load More" button (if present) and give the page time to fetch new cards.
    js_code = """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    if (loadMoreButton) {
        loadMoreButton.click();
        await new Promise(resolve => setTimeout(resolve, 2000));
    }
    """

    # Wait until more than 10 article cards are present before extracting.
    wait_for = """
    () => {
        const articles = document.querySelectorAll('article.tease-card');
        return articles.length > 10;
    }
    """

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            js_code=js_code,
            wait_for=wait_for,
            css_selector="article.tease-card",
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",
                api_token=os.getenv('OPENAI_API_KEY'),
                instruction="Summarize each article, focusing on technology-related content"
            ),
            bypass_cache=True,
        )

        summaries = json.loads(result.extracted_content)
        print(f"Number of summarized articles: {len(summaries)}")

        os.makedirs(".data", exist_ok=True)  # Ensure the output directory exists before writing
        with open(".data/tech_summaries.json", "w", encoding="utf-8") as f:
            json.dump(summaries, f, indent=2)

asyncio.run(extract_dynamic_content())
```
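Button clicks are only one trigger. For pages that lazy-load on scroll, the same `js_code` hook can scroll to the bottom of the page instead. A minimal sketch under that assumption, reusing the page and selector from the example above:

```python
import os
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

async def extract_after_scroll():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            # Scroll to the bottom to trigger lazy loading before extraction.
            js_code="window.scrollTo(0, document.body.scrollHeight);",
            css_selector="article.tease-card",
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",
                api_token=os.getenv('OPENAI_API_KEY'),
                instruction="Summarize each article, focusing on technology-related content"
            ),
            bypass_cache=True,
        )
        print(json.loads(result.extracted_content))

asyncio.run(extract_after_scroll())
```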
## Customizing LLM Provider

Crawl4AI uses the `litellm` library under the hood, which allows you to use any LLM provider you want. Just pass the correct model name and API token:
```python
extraction_strategy=LLMExtractionStrategy(
    provider="your_llm_provider/model_name",
    api_token="your_api_token",
    instruction="Your extraction instruction"
)
```
This flexibility allows you to integrate with various LLM providers and tailor the extraction process to your specific needs.
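For instance, the comment in Example 1 showed an Ollama provider string. A hedged sketch of what a locally hosted model might look like, assuming an Ollama server is running with the model already pulled (a local server generally does not validate the token, so a placeholder value is used here):

```python
extraction_strategy=LLMExtractionStrategy(
    provider="ollama/llama3",      # assumes the llama3 model has been pulled locally
    api_token="no-token-needed",   # placeholder; assumed ignored by a local Ollama server
    instruction="Extract the main points of the page"
)
```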
## Error Handling and Retries

When working with external LLM APIs, it's important to handle potential errors and implement retry logic. Here's an example of how you might do this:
```python
import os
import json
import asyncio
from tenacity import retry, stop_after_attempt, wait_exponential
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class LLMExtractionError(Exception):
    pass

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def extract_with_retry(crawler, url, extraction_strategy):
    try:
        result = await crawler.arun(url=url, extraction_strategy=extraction_strategy, bypass_cache=True)
        return json.loads(result.extracted_content)
    except Exception as e:
        raise LLMExtractionError(f"Failed to extract content: {e}") from e

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        try:
            content = await extract_with_retry(
                crawler,
                "https://www.example.com",
                LLMExtractionStrategy(
                    provider="openai/gpt-4o",
                    api_token=os.getenv('OPENAI_API_KEY'),
                    instruction="Extract and summarize main points"
                )
            )
            print("Extracted content:", content)
        except LLMExtractionError as e:
            print(f"Extraction failed after retries: {e}")

asyncio.run(main())
```
This example uses the `tenacity` library to implement a retry mechanism with exponential backoff, which helps absorb temporary failures or rate limiting from the LLM API. Here, `stop_after_attempt(3)` gives up after three tries, and `wait_exponential(multiplier=1, min=4, max=10)` grows the wait between attempts exponentially while clamping it to the 4–10 second range.
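One refinement worth considering: as written, every failure is wrapped in `LLMExtractionError` and retried, including genuine bugs. A hedged sketch using tenacity's `retry_if_exception_type` to retry only the error type you treat as transient (here, malformed JSON from the LLM), so programming errors surface immediately instead of burning retry attempts:

```python
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

@retry(
    retry=retry_if_exception_type(LLMExtractionError),
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
)
async def extract_with_selective_retry(crawler, url, extraction_strategy):
    result = await crawler.arun(url=url, extraction_strategy=extraction_strategy, bypass_cache=True)
    try:
        return json.loads(result.extracted_content)
    except json.JSONDecodeError as e:
        # Treat malformed LLM output as transient and retryable; anything else propagates as-is.
        raise LLMExtractionError(f"LLM returned invalid JSON: {e}") from e
```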