sitescraper / app.py
dindizz's picture
Create app.py
b0e2f0e verified
import os
import re
import tempfile
from urllib.parse import unquote, urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup
def scrape_article(url):
    """Fetch an article page and extract its title and paragraph text.

    The title is the stripped text of the first ``<h1>`` element; the content
    is the stripped text of every ``<p>`` element, joined with newlines.

    Args:
        url: Address of the article page to download.

    Returns:
        A ``(title, content)`` tuple of strings. If the page cannot be
        retrieved (network error, invalid URL, or a non-200 status), the
        first item is an error message and the second is an empty string.
        If the page has no ``<h1>``, the title is ``"No title found"``.
    """
    try:
        # A timeout keeps a stalled server from blocking the caller forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # Report connection problems and malformed URLs through the same
        # (message, "") convention used for bad status codes below.
        return f"Failed to retrieve the webpage. Error: {exc}", ""

    if response.status_code != 200:
        return "Failed to retrieve the webpage. Status code: " + str(response.status_code), ""

    soup = BeautifulSoup(response.text, 'html.parser')

    heading = soup.find('h1')
    title = heading.text.strip() if heading is not None else "No title found"

    # One line per <p> element, in document order.
    content = "\n".join(
        paragraph.text.strip() for paragraph in soup.find_all('p')
    )
    return title, content
def save_as_txt(title, content, url):
    """Write the scraped article to a ``.txt`` file in the system temp directory.

    The file name is derived from the last path segment of ``url``. Query
    strings and fragments are ignored, percent-escapes are decoded, and any
    character other than word characters, ``.`` and ``-`` is replaced with
    ``_`` so the name is valid on every common filesystem. If nothing usable
    remains, the name falls back to ``article``.

    Args:
        title: Article title, written on the first line after ``"Title: "``.
        content: Article body, written after a ``"Content:"`` header.
        url: Source URL, used only to pick the file name.

    Returns:
        The path of the file that was written.
    """
    # Only the path component matters; "?query" and "#fragment" must not end
    # up in the file name.
    url_path = urlparse(url).path
    slug = unquote(url_path.rstrip('/').rsplit('/', 1)[-1])

    # Replace anything that is unsafe in a file name, then drop leading and
    # trailing dots/underscores so the result is never hidden or "..".
    slug = re.sub(r"[^\w.-]+", "_", slug).strip("._")

    # Cap the length so that even multi-byte names stay below the usual
    # 255-byte file name limit.
    slug = slug[:60] or 'article'
    filename = f"{slug}.txt"

    file_path = os.path.join(tempfile.gettempdir(), filename)

    with open(file_path, "w", encoding="utf-8") as file:
        file.write(f"Title: {title}\n\nContent:\n{content}")
    return file_path
def scrape_and_download(url):
    """Scrape an article and prepare both a text preview and a download file.

    This is the callback wired to the Gradio button.

    Args:
        url: Article URL as typed by the user; surrounding whitespace is
            ignored.

    Returns:
        A ``(preview, file_path)`` tuple. ``preview`` shows the title and at
        most the first 500 characters of the content, with a truncation
        marker only when text was actually cut. ``file_path`` is the path of
        the generated ``.txt`` file, or ``None`` when no URL was given or no
        content could be scraped (in which case ``preview`` is a message
        explaining why).
    """
    url = (url or "").strip()
    if not url:
        # Avoid sending an empty request; tell the user what is missing.
        return "Please enter an article URL.", None

    title, content = scrape_article(url)
    if not content:
        return "No content found or failed to retrieve the page.", None

    file_path = save_as_txt(title, content, url)

    preview_limit = 500
    preview = content[:preview_limit]
    if len(content) > preview_limit:
        # Only claim truncation when part of the content was left out.
        preview += "... (truncated)"
    return f"Title: {title}\n\nContent:\n{preview}", file_path
# Gradio user interface: one URL field, a read-only preview, a file slot for
# the generated .txt, and a button that triggers the scrape.
description = "Input an article URL to scrape its title and content. A .txt file will be generated for download."

with gr.Blocks() as demo:
    # Page heading followed by the usage hint.
    gr.Markdown(value="## Web Article Scraper with Download")
    gr.Markdown(value=description)

    # Widgets, in the order they appear on the page.
    url_input = gr.Textbox(label="Enter Article URL")
    output_text = gr.Textbox(label="Extracted Content Preview", interactive=False)
    download_button = gr.File(label="Download Article as .txt")
    submit_button = gr.Button(value="Scrape Article")

    # Clicking the button sends the URL to the scraper and fills in both the
    # preview box and the download slot.
    submit_button.click(
        fn=scrape_and_download,
        inputs=url_input,
        outputs=[output_text, download_button],
    )

# Start the web server for the app.
demo.launch()