Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
def get_content(url, timeout=30):
    """Fetch a web page and return its title and meta description as text.

    The page is requested directly from ``url`` with a desktop-browser
    User-Agent and parsed with BeautifulSoup's ``html.parser``. Only the
    ``<title>`` text and the ``<meta name="description">`` content are used;
    headings and paragraphs are not collected. Commas in both fields are
    replaced with semicolons (presumably so the values are safe to store in
    comma-separated output -- confirm with callers).

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so an unresponsive host cannot block the caller forever.

    Returns:
        A ``(text, title)`` tuple. ``text`` is the title prefixed with
        ``"[title] "``, followed by a newline, a space and the description
        prefixed with ``"[description]"``. ``title`` is the cleaned title on
        its own. Fields missing from the page become empty strings.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
    )
    response = requests.get(
        url,
        headers={'User-Agent': user_agent},
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.title
    if title_tag is None:
        title = ''
    else:
        # ``.string`` is None when <title> is empty or has several children;
        # fall back to the concatenated text so the replace() below is safe.
        title = title_tag.string or title_tag.get_text()

    meta_description = soup.find('meta', attrs={'name': 'description'})
    if meta_description is None:
        description = ''
    else:
        # A description tag without a ``content`` attribute yields '' instead
        # of raising KeyError.
        description = meta_description.get('content') or ''

    title = title.replace(",", ";")
    description = description.replace(",", ";")

    text = "[title] " + title + "\n [description]" + description
    return text, title