Spaces:

dindizz
/

sitescraper

Running

App Files Files Community

sitescraper / app.py

dindizz

Create app.py

b0e2f0e verified 11 months ago

raw

history blame contribute delete

2.57 kB

	import requests
	from bs4 import BeautifulSoup
	import gradio as gr
	import tempfile
	import os

	def scrape_article(url, timeout=10):
	    """
	    Scrape the title and paragraph text from the given article URL.

	    Args:
	        url: Address of the web page to fetch.
	        timeout: Seconds to wait for the server before giving up.

	    Returns:
	        A ``(title, content)`` tuple of strings. ``title`` is the stripped
	        text of the first ``<h1>`` element, or "No title found" when the
	        page has none. ``content`` is the stripped text of every non-empty
	        ``<p>`` element, joined with newlines. When the page cannot be
	        retrieved, ``title`` carries a failure message and ``content`` is
	        the empty string.
	    """
	    # A timeout keeps an unresponsive server from hanging the caller, and
	    # catching RequestException turns malformed URLs and network failures
	    # into the same (message, "") result used for bad status codes.
	    try:
	        response = requests.get(url, timeout=timeout)
	    except requests.exceptions.RequestException as exc:
	        return f"Failed to retrieve the webpage. Error: {exc}", ""

	    if response.status_code != 200:
	        return "Failed to retrieve the webpage. Status code: " + str(response.status_code), ""

	    # Parsing the webpage content
	    soup = BeautifulSoup(response.text, 'html.parser')

	    # Extracting the title of the article
	    heading = soup.find('h1')
	    title = heading.text.strip() if heading is not None else "No title found"

	    # Skip whitespace-only paragraphs so a page without real text yields an
	    # empty string (which callers test for) instead of a run of newlines.
	    paragraphs = (paragraph.text.strip() for paragraph in soup.find_all('p'))
	    content = "\n".join(text for text in paragraphs if text)

	    return title, content

	def save_as_txt(title, content, url):
	    """
	    Save the scraped article to a .txt file in the system temp directory.

	    The file name is taken from the last segment of the URL's path (any
	    query string or fragment is ignored), reduced to characters that are
	    safe in file names. "article" is used when no usable segment remains.

	    Args:
	        title: Article title, written after a "Title: " prefix.
	        content: Article body, written under a "Content:" heading.
	        url: Source URL; used only to choose the file name.

	    Returns:
	        Path of the written file.
	    """
	    # Function-scope imports keep this block self-contained.
	    import re
	    from urllib.parse import urlparse

	    # Use only the path component so "?query" and "#fragment" parts (which
	    # may themselves contain "/") never end up in the name. Dropping
	    # trailing slashes lets ".../post/" resolve to "post".
	    last_segment = urlparse(url).path.rstrip('/').split('/')[-1]

	    # Collapse anything outside a conservative whitelist to "_" and cap the
	    # length, so the name is acceptable to the file system.
	    safe_name = re.sub(r'[^A-Za-z0-9._-]+', '_', last_segment)[:100].strip('._')
	    filename = f"{safe_name or 'article'}.txt"

	    # Placing the file in the system temporary directory
	    temp_dir = tempfile.gettempdir()
	    file_path = os.path.join(temp_dir, filename)

	    # Writing the title and content to the file
	    with open(file_path, "w", encoding="utf-8") as file:
	        file.write("Title: " + title + "\n\n")
	        file.write("Content:\n" + content)

	    return file_path

	def scrape_and_download(url):
	    """
	    Combine scraping and file saving for the Gradio interface.

	    Args:
	        url: Article URL as entered by the user.

	    Returns:
	        A ``(preview, file_path)`` tuple. On success ``preview`` holds the
	        title and up to the first 500 characters of the content, and
	        ``file_path`` is the path of the saved .txt file. When the URL is
	        blank or no content was obtained, ``preview`` is a message and
	        ``file_path`` is None.
	    """
	    preview_limit = 500

	    # Reject blank input up front instead of sending it to the scraper.
	    url = (url or "").strip()
	    if not url:
	        return "Please enter an article URL.", None

	    title, content = scrape_article(url)
	    if not content:
	        return "No content found or failed to retrieve the page.", None

	    file_path = save_as_txt(title, content, url)

	    # Only claim truncation when the preview really is shorter than the
	    # full content.
	    preview = content[:preview_limit]
	    if len(content) > preview_limit:
	        preview += "... (truncated)"
	    return f"Title: {title}\n\nContent:\n{preview}", file_path

	# Build the Gradio UI
	description = "Input an article URL to scrape its title and content. A .txt file will be generated for download."

	with gr.Blocks() as demo:
	    # Page heading followed by the usage blurb
	    gr.Markdown("## Web Article Scraper with Download")
	    gr.Markdown(description)

	    # URL entry, non-editable preview, file output, and the trigger button
	    url_input = gr.Textbox(label="Enter Article URL")
	    output_text = gr.Textbox(label="Extracted Content Preview", interactive=False)
	    download_button = gr.File(label="Download Article as .txt")
	    submit_button = gr.Button("Scrape Article")

	    # Clicking the button feeds the URL to scrape_and_download and routes
	    # its two return values to the preview box and the file component.
	    submit_button.click(
	        fn=scrape_and_download,
	        inputs=url_input,
	        outputs=[output_text, download_button],
	    )

	# Start serving the interface
	demo.launch()