Spaces:

JimLin0704
/

Crawl4AI

Sleeping

amaye15

test

03c0888 11 months ago

1.74 kB


	import requests, base64, os

	# Basic request: crawl one page and ask the service for a screenshot.
	data = {
	    "urls": ["https://www.nbcnews.com/business"],
	    "screenshot": True,
	}

	# requests never times out on its own, so pass an explicit timeout to avoid
	# hanging indefinitely if the crawl service stalls.
	response = requests.post("https://crawl4ai.com/crawl", json=data, timeout=120)
	# Fail here on an HTTP 4xx/5xx instead of later with a confusing JSON or
	# KeyError failure while unpacking the body.
	response.raise_for_status()
	result = response.json()['results'][0]
	print(result.keys())
	# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
	# 'links', 'screenshot', 'markdown', 'extracted_content',
	# 'metadata', 'error_message'])

	# The screenshot field is base64 text; decode it and write the raw bytes.
	with open("screenshot.png", "wb") as f:
	    f.write(base64.b64decode(result['screenshot']))

	# Payload variant: filter the crawled content with a CSS selector
	# (here the selector is "article").
	data = {
	    "urls": ["https://www.nbcnews.com/business"],
	    "css_selector": "article",
	    "screenshot": True,
	}

	# Payload variant: run a JS snippet on the page before content extraction.
	# The snippet looks for a button whose text contains 'Load More' and clicks
	# it when one is found.
	data = {
	    "urls": ["https://www.nbcnews.com/business"],
	    "screenshot": True,
	    "js": ["""
	const loadMoreButton = Array.from(document.querySelectorAll('button')).
	find(button => button.textContent.includes('Load More'));
	loadMoreButton && loadMoreButton.click();
	"""],
	}

	# Payload variant: choose a custom extraction strategy by name and pass its
	# arguments alongside (a semantic filter phrase for CosineStrategy).
	data = {
	    "urls": ["https://www.nbcnews.com/business"],
	    "extraction_strategy": "CosineStrategy",
	    "extraction_strategy_args": {"semantic_filter": "inflation rent prices"},
	}

	# Payload variant: LLM-driven extraction. The provider token is read from
	# the GROQ_API_KEY environment variable (None when it is not set).
	data = {
	    "urls": ["https://www.nbcnews.com/business"],
	    "extraction_strategy": "LLMExtractionStrategy",
	    "extraction_strategy_args": {
	        "provider": "groq/llama3-8b-8192",
	        "api_token": os.getenv("GROQ_API_KEY"),
	        "instruction": """I am interested in only financial news,
	and translate them in French.""",
	    },
	}