import requests
from bs4 import BeautifulSoup
import gradio as gr
import tempfile
import os
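# Third-party dependencies: requests, beautifulsoup4, gradio.
# On a Hugging Face Space these would normally be pinned in requirements.txt.
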
def scrape_article(url):
    """
    Scrape the title and content from the given article URL.
    """
    # Send a request to the webpage; guard against network errors and slow hosts
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as error:
        return f"Failed to retrieve the webpage: {error}", ""
    if response.status_code != 200:
        return f"Failed to retrieve the webpage. Status code: {response.status_code}", ""

    # Parse the webpage content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the title of the article
    title = soup.find('h1')
    if title:
        title = title.text.strip()
    else:
        title = "No title found"

    # Extract the content of the article
    article_content = []
    for paragraph in soup.find_all('p'):
        article_content.append(paragraph.text.strip())

    # Join all paragraphs to form the article content
    content = "\n".join(article_content)
    return title, content

def save_as_txt(title, content, url):
    """
    Save the scraped article content to a temporary .txt file.
    """
    # Extract filename from the URL
    filename = url.split('/')[-1] or 'article'
    filename = f"{filename}.txt"

    # Creating a temporary file
    temp_dir = tempfile.gettempdir()
    file_path = os.path.join(temp_dir, filename)

    # Writing the title and content to the file
    with open(file_path, "w", encoding="utf-8") as file:
        file.write("Title: " + title + "\n\n")
        file.write("Content:\n" + content)
    return file_path

def scrape_and_download(url):
    """
    Combine scraping and file saving for the Gradio interface.
    """
    title, content = scrape_article(url)
    if not content:
        return "No content found or failed to retrieve the page.", None
    file_path = save_as_txt(title, content, url)
    return f"Title: {title}\n\nContent:\n{content[:500]}... (truncated)", file_path

# Gradio Interface
description = "Input an article URL to scrape its title and content. A .txt file will be generated for download."

with gr.Blocks() as demo:
    gr.Markdown("## Web Article Scraper with Download")
    gr.Markdown(description)
    url_input = gr.Textbox(label="Enter Article URL")
    output_text = gr.Textbox(label="Extracted Content Preview", interactive=False)
    download_button = gr.File(label="Download Article as .txt")
    submit_button = gr.Button("Scrape Article")

    # Linking components
    submit_button.click(scrape_and_download, inputs=url_input, outputs=[output_text, download_button])

# Launch the Gradio app
demo.launch()
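
# ---------------------------------------------------------------------------
# Optional sanity check (a sketch, not part of the Space): because
# demo.launch() above blocks until the server stops, the pipeline can be
# exercised without the UI from a separate script or interactive session.
# The URL below is a placeholder assumption, not taken from this Space:
#
#   title, content = scrape_article("https://example.com/some-article")
#   print(title)
#   print(content[:200])
#   print(save_as_txt(title, content, "https://example.com/some-article"))
# ---------------------------------------------------------------------------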