Spaces:

wgcv
/

Tidy-Tabs-Titles

Sleeping

Tidy-Tabs-Titles / loadhtml.py

first test

b7a1a13 over 1 year ago

1.36 kB

	import requests
	from bs4 import BeautifulSoup
	def get_content(url, timeout=30):
	    """Fetch *url* and build a short text summary of the page.

	    The page is requested directly with a desktop-browser User-Agent,
	    parsed with BeautifulSoup's ``html.parser``, and reduced to its
	    ``<title>`` and ``<meta name="description">`` values.

	    Args:
	        url: Address of the page to download.
	        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
	            without one, an unresponsive server would block forever.

	    Returns:
	        A ``(text, title)`` tuple. ``title`` is the page title ('' when
	        absent) and ``text`` is
	        ``"[title] <title>\\n [description]<description>"``. Commas in
	        both values are replaced with semicolons.

	    Raises:
	        requests.RequestException: If the request fails or times out.
	    """
	    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'}
	    response = requests.get(url, headers=headers, timeout=timeout)

	    soup = BeautifulSoup(response.text, 'html.parser')

	    # ``.string`` is None for an empty <title> or one with several child
	    # nodes, so fall back to '' instead of failing on ``.replace`` below.
	    title = (soup.title.string or '') if soup.title else ''

	    # A description <meta> tag may lack a ``content`` attribute; treat
	    # that the same as a missing tag rather than raising KeyError.
	    meta_description = soup.find('meta', attrs={'name': 'description'})
	    description = (meta_description.get('content') or '') if meta_description else ''

	    # Commas are swapped for semicolons, presumably so the values can be
	    # stored in comma-separated output -- confirm against the caller.
	    description = description.replace(",", ";")
	    title = title.replace(",", ";")

	    text = "[title] " + title + "\n [description]" + description
	    return text, title