dindizz committed on
Commit
b0e2f0e
·
verified ·
1 Parent(s): 044dc3a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -0
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import gradio as gr
4
+ import tempfile
5
+ import os
6
+
7
def scrape_article(url, timeout=10):
    """Scrape the title and paragraph text from the given article URL.

    Args:
        url: Address of the web page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled host cannot block the caller indefinitely.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the stripped text of the
        first ``<h1>`` (or ``"No title found"``) and ``content`` is the
        stripped text of every ``<p>`` joined by newlines. On failure
        (request error or non-200 status) the first element is an error
        message and the second is an empty string.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Covers connection errors, timeouts and malformed URLs; report them
        # through the same (message, "") shape used for bad status codes.
        return "Failed to retrieve the webpage: " + str(exc), ""

    if response.status_code != 200:
        return "Failed to retrieve the webpage. Status code: " + str(response.status_code), ""

    soup = BeautifulSoup(response.text, 'html.parser')

    # The first <h1> is treated as the article title.
    heading = soup.find('h1')
    title = heading.text.strip() if heading else "No title found"

    # Every <p> on the page contributes one line of content.
    content = "\n".join(
        paragraph.text.strip() for paragraph in soup.find_all('p')
    )

    return title, content
35
+
36
+ def save_as_txt(title, content, url):
37
+ """
38
+ Save the scraped article content to a temporary .txt file.
39
+ """
40
+ # Extract filename from the URL
41
+ filename = url.split('/')[-1] or 'article'
42
+ filename = f"{filename}.txt"
43
+
44
+ # Creating a temporary file
45
+ temp_dir = tempfile.gettempdir()
46
+ file_path = os.path.join(temp_dir, filename)
47
+
48
+ # Writing the title and content to the file
49
+ with open(file_path, "w", encoding="utf-8") as file:
50
+ file.write("Title: " + title + "\n\n")
51
+ file.write("Content:\n" + content)
52
+
53
+ return file_path
54
+
55
def scrape_and_download(url):
    """Scrape an article and write it to a file, for the Gradio interface.

    Args:
        url: Article URL as typed by the user.

    Returns:
        A ``(preview_text, file_path)`` tuple. ``preview_text`` shows the
        title and up to the first 500 characters of content; ``file_path``
        is the saved .txt file. When the URL is blank, or no content could
        be obtained, ``preview_text`` is a message and ``file_path`` is
        ``None``.
    """
    # Reject blank input up front instead of letting the HTTP client raise.
    url = (url or "").strip()
    if not url:
        return "Please enter an article URL.", None

    title, content = scrape_article(url)
    if not content:
        return "No content found or failed to retrieve the page.", None

    file_path = save_as_txt(title, content, url)

    # Only flag the preview as truncated when text was actually cut off.
    preview = content[:500]
    if len(content) > 500:
        preview += "... (truncated)"

    return f"Title: {title}\n\nContent:\n{preview}", file_path
65
+
66
# Gradio UI: a URL box and a button, with a text preview and a file download
# as outputs.
description = "Input an article URL to scrape its title and content. A .txt file will be generated for download."

with gr.Blocks() as demo:
    # Heading followed by the usage blurb.
    gr.Markdown("## Web Article Scraper with Download")
    gr.Markdown(description)

    # Components are created in the order they should appear on the page.
    url_input = gr.Textbox(label="Enter Article URL")
    output_text = gr.Textbox(label="Extracted Content Preview", interactive=False)
    download_button = gr.File(label="Download Article as .txt")
    submit_button = gr.Button("Scrape Article")

    # Clicking the button feeds the URL to the scraper and fills both outputs.
    submit_button.click(
        fn=scrape_and_download,
        inputs=url_input,
        outputs=[output_text, download_button],
    )

# Start the web server for the app.
demo.launch()