|
|
|
|
|
from smolagents import tool |
|
|
import requests |
|
|
from markdownify import markdownify as md |
|
|
from bs4 import BeautifulSoup |
|
|
from common.mylogger import save_file_with_timestamp, mylog |
|
|
|
|
|
@tool
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Fetches the content of a given URL.

    If markdown conversion is enabled, script and style elements are removed
    and the text content is returned as Markdown; otherwise the raw,
    unfiltered HTML is returned.

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to Markdown format. else return the raw HTML.

    Returns:
        str: The page content (Markdown or raw HTML).

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=30)
    # Fail fast on HTTP errors instead of converting an error page to markdown.
    response.raise_for_status()

    if convert_to_markdown:
        soup = BeautifulSoup(response.text, "html.parser")

        # Drop non-content elements before conversion.
        for tag in soup(["script", "style"]):
            tag.extract()

        content = None
        if "wikipedia.org" in url:
            # Wikipedia wraps the article body in <main id="content">;
            # restricting to it strips navigation chrome from the output.
            main_content = soup.find("main", {"id": "content"})
            if main_content:
                content = md(str(main_content), strip=['script', 'style'], heading_style="ATX").strip()
        if content is None:
            # Fallback for non-wikipedia pages, or wikipedia pages whose
            # expected <main id="content"> node is missing (previously this
            # path could leave content as None).
            content = md(response.text, strip=['script', 'style'], heading_style="ATX").strip()
    else:
        content = response.text

    # Persist a timestamped copy for debugging/auditing of agent runs.
    save_file_with_timestamp(content, "webpage", ".md" if convert_to_markdown else ".html")

    return content
|
|
|
|
|
@tool
def search_web(query: str, num_results: int = 5) -> list:
    """
    Perform a web search using local SearXNG instance.

    Args:
        query (str): The search query.
        num_results (int): The number of results to return.

    Returns:
        list: A list of search results sorted by score with {url, title, content, score} for each result.
            Empty list if the request fails.
    """
    searxng_url = "http://localhost:8888/search"
    params = {"q": query, "format": 'json'}
    # Bound the request so a hung SearXNG instance cannot block the agent
    # forever (mirrors the timeout used by fetch_webpage).
    response = requests.get(searxng_url, params=params, timeout=30)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    # SearXNG returns results already sorted by score; keep the top N and
    # project each result down to the fields callers rely on.
    results = response.json().get("results", [])[:num_results]
    return [
        {
            "url": result.get("url"),
            "title": result.get("title"),
            "content": result.get("content"),
            "score": result.get("score"),
        }
        for result in results
    ]
|
|
|
|
|
if __name__ == "__main__":
    # Smoke-test the search tool against the local SearXNG instance.
    try:
        query = "What is the capital of France?"
        print(search_web(query, 3))
    except Exception as e:
        print(f"An error occurred: {e}")

    # Smoke-test page fetching with markdown conversion.
    try:
        video_id = "L1vXCYZAYYM"
        video_url = "https://www.youtube.com/watch?v=" + video_id
        url = "https://en.wikipedia.org/wiki/Malko_Competition"

        page_content = fetch_webpage(url, convert_to_markdown=True)
        print(page_content.encode("utf-8"))
    except Exception as e:
        print(f"An error occurred: {e}")
|
|
|
|
|
|
|
|
|