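"""Crawl4ai quickstart script: walks through basic crawling, caching, screenshots,
chunking and extraction strategies, CSS-selector targeting, JavaScript interaction,
and crawler hooks."""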
import os
import time
import json    # needed by print_result below
import base64  # needed by screenshot_usage below
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
from rich import print
from rich.console import Console
from functools import lru_cache

console = Console()
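
# Build one WebCrawler, warm it up once, and reuse it across the demos below.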
@lru_cache()  # lru_cache (imported above) makes repeat calls return the same warmed-up instance
def create_crawler():
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler

def print_result(result):
    # Print each key on one line with just the first 20 characters of its value and three dots
    console.print("\t[bold]Result:[/bold]")
    for key, value in result.model_dump().items():
        if isinstance(value, str) and value:
            console.print(f"\t{key}: [green]{value[:20]}...[/green]")
    if result.extracted_content:
        items = json.loads(result.extracted_content)
        print(f"\t[bold]{len(items)} blocks extracted![/bold]")

def cprint(message, press_any_key=False):
    console.print(message)
    if press_any_key:
        console.print("Press any key to continue...", style="")
        input()

def basic_usage(crawler):
    cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

def basic_usage_some_params(crawler):
    cprint("🛠️ [bold cyan]Basic Usage: Provide a URL and a few tuning parameters, and let Crawl4ai do the magic![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

def screenshot_usage(crawler):
    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
    # Save the base64-encoded screenshot to a file
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))
    cprint("Screenshot saved to 'screenshot.png'!")
    print_result(result)

def understanding_parameters(crawler):
    cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
    cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
    # First crawl: served from cache if the URL was crawled before
    cprint("1️⃣ First crawl (uses the cache):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business")
    end_time = time.time()
    cprint(f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time:.2f} seconds and result (from cache):[/bold yellow]")
    print_result(result)

    # Force a fresh crawl, ignoring the cache
    cprint("2️⃣ Second crawl (forced to crawl again):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
    end_time = time.time()
    cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time:.2f} seconds and result (forced fresh crawl):[/bold yellow]")
    print_result(result)

def add_chunking_strategy(crawler):
    # Adding a chunking strategy: RegexChunking
    cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
    cprint("RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=RegexChunking(patterns=["\n\n"])  # split on blank lines
    )
    cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
    print_result(result)

    # Adding another chunking strategy: NlpSentenceChunking
    cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
    cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=NlpSentenceChunking()
    )
    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
    print_result(result)

def add_extraction_strategy(crawler):
    # Adding an extraction strategy: CosineStrategy
    cprint("\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", True)
    cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            word_count_threshold=10,
            max_dist=0.2,
            linkage_method="ward",
            top_k=3,
            sim_threshold=0.3,
            verbose=True,
        )
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
    print_result(result)

    # Using semantic_filter with CosineStrategy
    cprint("You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            semantic_filter="inflation rent prices",
        )
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]")
    print_result(result)

def add_llm_extraction_strategy(crawler):
    # Adding an LLM extraction strategy without instructions
    cprint("\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", True)
    cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]")
    print_result(result)

    # Adding an LLM extraction strategy with instructions
    cprint("\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", True)
    cprint("Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            provider="openai/gpt-4o",
            api_token=os.getenv('OPENAI_API_KEY'),
            instruction="I am interested in only financial news"
        )
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]")
    print_result(result)

    # Same page, a different instruction
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            provider="openai/gpt-4o",
            api_token=os.getenv('OPENAI_API_KEY'),
            instruction="Extract only content related to technology"
        )
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]")
    print_result(result)

def targeted_extraction(crawler):
    # Using a CSS selector to extract only H2 tags
    cprint("\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        css_selector="h2"
    )
    cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
    print_result(result)

def interactive_extraction(crawler):
    # Passing JavaScript code to interact with the page
    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click the 'Load More' button![/bold cyan]", True)
    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
| js_code = """ | |
| const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); | |
| loadMoreButton && loadMoreButton.click(); | |
| """ | |
| # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) | |
| # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) | |
| result = crawler.run( | |
| url="https://www.nbcnews.com/business", | |
| js = js_code | |
| ) | |
| cprint("[LOG] π¦ [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]") | |
| print_result(result) | |

def multiple_scripts(crawler):
    # Passing a list of JavaScript snippets to run in sequence
    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing multiple JavaScript snippets to click 'Load More' twice![/bold cyan]", True)
    cprint("In this example we click the 'Load More' button twice by passing the same snippet two times.")
| js_code = [""" | |
| const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); | |
| loadMoreButton && loadMoreButton.click(); | |
| """] * 2 | |
| # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) | |
| # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) | |
| result = crawler.run( | |
| url="https://www.nbcnews.com/business", | |
| js = js_code | |
| ) | |
| cprint("[LOG] π¦ [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]") | |
| print_result(result) | |

def using_crawler_hooks(crawler):
    # Example usage of the hooks for authentication and setting a cookie
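    # Four hooks are wired up below: on_driver_created (log in and set a cookie),
    # before_get_url (add a header via CDP), after_get_url (log the URL), and
    # before_return_html (log the HTML length).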
    def on_driver_created(driver):
        print("[HOOK] on_driver_created")
        # Example customization: maximize the window
        driver.maximize_window()
        # Example customization: log in to a hypothetical website
        driver.get('https://example.com/login')
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, 'username'))
        )
        driver.find_element(By.NAME, 'username').send_keys('testuser')
        driver.find_element(By.NAME, 'password').send_keys('password123')
        driver.find_element(By.NAME, 'login').click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'welcome'))
        )
        # Add a custom cookie
        driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
        return driver

    def before_get_url(driver):
        print("[HOOK] before_get_url")
        # Example customization: add a custom header
        # Enable the Network domain so extra headers can be sent via CDP
        driver.execute_cdp_cmd('Network.enable', {})
        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
        return driver

    def after_get_url(driver):
        print("[HOOK] after_get_url")
        # Example customization: log the URL
        print(driver.current_url)
        return driver

    def before_return_html(driver, html):
        print("[HOOK] before_return_html")
        # Example customization: log the HTML length
        print(len(html))
        return driver

    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
    crawler_strategy.set_hook('on_driver_created', on_driver_created)
    crawler_strategy.set_hook('before_get_url', before_get_url)
    crawler_strategy.set_hook('after_get_url', after_get_url)
    crawler_strategy.set_hook('before_return_html', before_return_html)
    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
    crawler.warmup()
    result = crawler.run(url="https://example.com")
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result)

def using_crawler_hooks_delay_example(crawler):
    def delay(driver):
        print("Delaying for 5 seconds...")
        time.sleep(5)
        print("Resuming...")

    def create_crawler():
        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
        crawler_strategy.set_hook('after_get_url', delay)
        crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
        crawler.warmup()
        return crawler

    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the URL to make sure the entire page is fetched.[/bold cyan]")
    crawler = create_crawler()
    result = crawler.run(url="https://google.com", bypass_cache=True)
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result)

def main():
    cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🚀[/bold green]")
    cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
    cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files.")
    crawler = create_crawler()

    crawler.always_by_pass_cache = True
    basic_usage(crawler)
    # basic_usage_some_params(crawler)

    # Allow caching for the cache demo, then force fresh crawls again afterwards.
    crawler.always_by_pass_cache = False
    understanding_parameters(crawler)

    crawler.always_by_pass_cache = True
    screenshot_usage(crawler)
    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
    targeted_extraction(crawler)
    interactive_extraction(crawler)
    multiple_scripts(crawler)
    cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")

if __name__ == "__main__":
    main()