Spaces:

dindizz
/

sitescraper

Running

App Files Files Community

dindizz committed on Dec 17, 2024

Commit

b0e2f0e

verified ·

1 Parent(s): 044dc3a

Create app.py

Browse files

Files changed (1) hide show

app.py +81 -0

app.py ADDED Viewed

	@@ -0,0 +1,81 @@

import os
import re
import tempfile
from urllib.parse import unquote, urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup
def scrape_article(url, timeout=10):
    """Download an article page and extract its headline and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(title, content)`` tuple of strings. ``title`` is the stripped
        text of the first ``<h1>`` element, or ``"No title found"`` when the
        page has none. ``content`` is the stripped text of every non-empty
        ``<p>`` element, one per line. When the page cannot be fetched, the
        first element is an error message and ``content`` is ``""``.
    """
    try:
        # requests applies no timeout by default; without one a stalled
        # server would block the caller indefinitely.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Malformed URLs, DNS/connection failures and timeouts all land here;
        # report them through the same (message, "") shape as HTTP errors.
        return f"Failed to retrieve the webpage. Error: {exc}", ""
    if response.status_code != 200:
        return f"Failed to retrieve the webpage. Status code: {response.status_code}", ""

    soup = BeautifulSoup(response.text, "html.parser")

    heading = soup.find("h1")
    title = heading.text.strip() if heading is not None else "No title found"

    # Skip paragraphs that are empty after stripping so spacer <p> tags do
    # not produce blank lines or make an otherwise empty article look filled.
    paragraphs = (tag.text.strip() for tag in soup.find_all("p"))
    content = "\n".join(text for text in paragraphs if text)
    return title, content
def save_as_txt(title, content, url):
    """Write the scraped article to a ``.txt`` file in the system temp directory.

    The file is named after the last non-empty path segment of ``url``,
    reduced to characters that are safe in a file name. ``article`` is used
    when the URL has no usable segment.

    Args:
        title: Article title, written on the first line after ``Title: ``.
        content: Article body, written under a ``Content:`` heading.
        url: Source URL; only used to derive the file name.

    Returns:
        Path of the file that was written.
    """
    try:
        # Only the path component is used, so query strings and fragments
        # ("?id=3", "#top") never end up in the file name.
        url_path = urlparse(url).path
    except ValueError:
        # urlparse rejects some malformed netlocs; fall back to the default name.
        url_path = ""
    segment = url_path.rstrip("/").rsplit("/", 1)[-1]

    # Decode percent-escapes, then collapse every run of characters other
    # than word characters, dots and hyphens into "_". This removes path
    # separators and characters some filesystems forbid. Leading/trailing
    # dots and underscores are trimmed so names like ".." cannot survive.
    stem = re.sub(r"[^\w.-]+", "_", unquote(segment)).strip("._")
    # Cap the length to stay well below common file-name limits.
    stem = stem[:100] or "article"

    file_path = os.path.join(tempfile.gettempdir(), f"{stem}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(f"Title: {title}\n\nContent:\n{content}")
    return file_path
def scrape_and_download(url):
    """Scrape an article and prepare both a preview and a downloadable file.

    This is the callback wired to the Gradio button.

    Args:
        url: Article address as typed by the user; surrounding whitespace
            is ignored.

    Returns:
        A ``(message, file_path)`` tuple. On success ``message`` is a preview
        holding the title and up to the first 500 characters of the content,
        and ``file_path`` points at the saved ``.txt`` file. On failure
        ``message`` describes the problem and ``file_path`` is ``None``.
    """
    preview_limit = 500

    # Pasted URLs often carry stray whitespace; an empty box needs no request.
    url = (url or "").strip()
    if not url:
        return "Please enter a URL.", None

    title, content = scrape_article(url)
    if not content:
        # scrape_article reports fetch failures in the title slot with empty
        # content; pass that specific message on instead of hiding it.
        if title.startswith("Failed to retrieve"):
            return title, None
        return "No content found or failed to retrieve the page.", None

    file_path = save_as_txt(title, content, url)

    # Only flag the preview as truncated when something was actually cut off.
    preview = content[:preview_limit]
    if len(content) > preview_limit:
        preview += "... (truncated)"
    return f"Title: {title}\n\nContent:\n{preview}", file_path
# ---- Gradio user interface ----
# Introductory text rendered below the page heading.
description = "Input an article URL to scrape its title and content. A .txt file will be generated for download."

with gr.Blocks() as demo:
    # Heading and description
    gr.Markdown(value="## Web Article Scraper with Download")
    gr.Markdown(value=description)

    # Input box, preview box, download slot and trigger button
    url_input = gr.Textbox(label="Enter Article URL")
    output_text = gr.Textbox(label="Extracted Content Preview", interactive=False)
    download_button = gr.File(label="Download Article as .txt")
    submit_button = gr.Button(value="Scrape Article")

    # On click, pass the URL to scrape_and_download and route its two return
    # values to the preview box and the file component respectively.
    submit_button.click(
        fn=scrape_and_download,
        inputs=url_input,
        outputs=[output_text, download_button],
    )

# Start serving the interface
demo.launch()