# Spaces: Sleeping
# Usage examples for the https://crawl4ai.com/crawl endpoint.
#
# Only the first payload is actually sent: its response is inspected and the
# returned screenshot is written to "screenshot.png". The payloads that follow
# are alternative request bodies; each one simply rebinds `data` and is not
# posted by this script.
import base64
import os

import requests

# Upper bound (seconds) on waiting for the server. Without an explicit
# timeout, requests.post() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 120

data = {
    "urls": ["https://www.nbcnews.com/business"],
    "screenshot": True,
}
response = requests.post(
    "https://crawl4ai.com/crawl",
    json=data,
    timeout=REQUEST_TIMEOUT,
)
# Surface HTTP errors directly instead of failing later while parsing the body.
response.raise_for_status()
result = response.json()["results"][0]
print(result.keys())
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
# 'links', 'screenshot', 'markdown', 'extracted_content',
# 'metadata', 'error_message'])

# The screenshot arrives base64-encoded. Guard against it being absent/empty
# (presumably the case when the crawl did not succeed) so the failure is
# reported clearly rather than as a TypeError from b64decode.
screenshot_b64 = result.get("screenshot")
if screenshot_b64:
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(screenshot_b64))
else:
    print(f"No screenshot returned: {result.get('error_message')}")

# Example of filtering the content using CSS selectors
data = {
    "urls": [
        "https://www.nbcnews.com/business",
    ],
    "css_selector": "article",
    "screenshot": True,
}

# Example of executing a JS script on the page before extracting the content
data = {
    "urls": [
        "https://www.nbcnews.com/business",
    ],
    "screenshot": True,
    # The script looks for a button whose text contains "Load More" and
    # clicks it if one exists.
    "js": [
        "\n"
        "const loadMoreButton = Array.from(document.querySelectorAll('button')).\n"
        "find(button => button.textContent.includes('Load More'));\n"
        "loadMoreButton && loadMoreButton.click();\n"
    ],
}

# Example of using a custom extraction strategy
data = {
    "urls": [
        "https://www.nbcnews.com/business",
    ],
    "extraction_strategy": "CosineStrategy",
    "extraction_strategy_args": {
        "semantic_filter": "inflation rent prices",
    },
}

# Example of using LLM to extract content
data = {
    "urls": [
        "https://www.nbcnews.com/business",
    ],
    "extraction_strategy": "LLMExtractionStrategy",
    "extraction_strategy_args": {
        "provider": "groq/llama3-8b-8192",
        # NOTE: os.environ.get() yields None when GROQ_API_KEY is unset, in
        # which case the payload carries a null token.
        "api_token": os.environ.get("GROQ_API_KEY"),
        "instruction": (
            "I am interested in only financial news,\n"
            "and translate them in French."
        ),
    },
}