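"""
Web Article Scraper with Download — a small Gradio app.

Scrapes the first <h1> (title) and all <p> paragraphs from a user-supplied
article URL, shows a preview of the extracted text, and offers the result as
a downloadable .txt file.

Third-party dependencies (PyPI package names): requests, beautifulsoup4, gradio.
"""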
import requests
from bs4 import BeautifulSoup
import gradio as gr
import tempfile
import os

def scrape_article(url):
    """
    Function to scrape title and content from the given article URL.
    """
    # Send a request to the webpage (with a timeout so the UI does not hang on slow hosts)
    try:
        response = requests.get(url, timeout=15)
    except requests.RequestException as exc:
        return f"Failed to retrieve the webpage: {exc}", ""
    if response.status_code != 200:
        return f"Failed to retrieve the webpage. Status code: {response.status_code}", ""

    # Parsing the webpage content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the title of the article from the first <h1>, if any
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else "No title found"

    # Extracting the content of the article
    article_content = []
    for paragraph in soup.find_all('p'):
        article_content.append(paragraph.text.strip())

    # Joining all paragraphs to form the article content
    content = "\n".join(article_content)

    return title, content

def save_as_txt(title, content, url):
    """
    Save the scraped article content to a temporary .txt file.
    """
    # Derive a filename from the last path segment of the URL, ignoring any
    # query string or fragment; fall back to 'article' for trailing-slash URLs
    filename = url.split('?')[0].split('#')[0].split('/')[-1] or 'article'
    filename = f"{filename}.txt"

    # Creating a temporary file
    temp_dir = tempfile.gettempdir()
    file_path = os.path.join(temp_dir, filename)
    
    # Writing the title and content to the file
    with open(file_path, "w", encoding="utf-8") as file:
        file.write("Title: " + title + "\n\n")
        file.write("Content:\n" + content)

    return file_path

def scrape_and_download(url):
    """
    Combine scraping and file saving for Gradio interface.
    """
    title, content = scrape_article(url)
    if not content:
        # On failure, scrape_article returns its error message in the title slot
        return f"{title}\n\nNo article content could be extracted.", None

    file_path = save_as_txt(title, content, url)
    preview = content[:500] + ("... (truncated)" if len(content) > 500 else "")
    return f"Title: {title}\n\nContent:\n{preview}", file_path

# Gradio Interface
description = "Input an article URL to scrape its title and content. A .txt file will be generated for download."
with gr.Blocks() as demo:
    gr.Markdown("## Web Article Scraper with Download")
    gr.Markdown(description)
    
    url_input = gr.Textbox(label="Enter Article URL")
    output_text = gr.Textbox(label="Extracted Content Preview", interactive=False)
    download_button = gr.File(label="Download Article as .txt")
    submit_button = gr.Button("Scrape Article")

    # Wire the button: scrape_and_download returns (preview text, file path),
    # matching the order of the outputs list
    submit_button.click(scrape_and_download, inputs=url_input, outputs=[output_text, download_button])

# Launch the Gradio app when the script is run directly
if __name__ == "__main__":
    demo.launch()
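# Example usage (assuming the file is saved as app.py — the filename is not fixed by this script):
#   python app.py
# Gradio then prints a local URL (by default http://127.0.0.1:7860) to open in a browser.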