| """ | |
| Web scraping and processing utilities. | |
| """ | |
| from typing import Dict, Any, List, Optional | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
| from urllib.parse import urlparse, urljoin | |
| from tenacity import retry, stop_after_attempt, wait_exponential | |


class WebUtils:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
    def _get(self, url: str, timeout: int) -> requests.Response:
        """GET a URL, retrying transient failures with exponential backoff."""
        response = self.session.get(url, timeout=timeout)
        response.raise_for_status()
        return response

    def fetch_url(self, url: str, timeout: int = 10) -> Optional[str]:
        """Fetch content from a URL, returning None on failure."""
        # requests is synchronous, so this is a plain method rather than a coroutine
        try:
            return self._get(url, timeout).text
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            return None

    def extract_text(self, html: str) -> str:
        """Extract clean text from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()
        # Get text and clean it
        text = soup.get_text(separator="\n", strip=True)
        # Remove excessive newlines
        text = re.sub(r"\n\s*\n", "\n\n", text)
        return text.strip()

    def extract_metadata(self, html: str, url: str) -> Dict[str, Any]:
        """Extract metadata from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        metadata = {
            "url": url,
            "title": None,
            "description": None,
            "keywords": None,
            "author": None,
            "published_date": None
        }
        # Extract title (soup.title.string is None when the tag contains nested markup)
        if soup.title and soup.title.string:
            metadata["title"] = soup.title.string.strip()
        # Extract meta tags, checking both the name= and property= attributes
        for tag in soup.find_all("meta"):
            name = (tag.get("name") or tag.get("property") or "").lower()
            content = tag.get("content")
            if name == "description":
                metadata["description"] = content
            elif name == "keywords":
                metadata["keywords"] = content
            elif name == "author":
                metadata["author"] = content
            elif name in ("article:published_time", "published_time", "publication_date"):
                metadata["published_date"] = content
        return metadata

    def extract_links(self, html: str, base_url: str) -> List[str]:
        """Extract all links from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        links = []
        for link in soup.find_all("a"):
            href = link.get("href")
            if href:
                # Convert relative URLs to absolute
                absolute_url = urljoin(base_url, href)
                # Only include http(s) URLs
                if absolute_url.startswith(("http://", "https://")):
                    links.append(absolute_url)
        return list(set(links))  # Remove duplicates

    def is_valid_url(self, url: str) -> bool:
        """Check if a URL is valid."""
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False

    def clean_url(self, url: str) -> str:
        """Clean and normalize a URL."""
        parsed = urlparse(url)
        path = parsed.path
        # Drop common tracking parameters (utm_*, ref_*, source, campaign)
        query_params = []
        if parsed.query:
            for param in parsed.query.split("&"):
                if "=" in param:
                    key = param.split("=")[0].lower()
                    if not any(track in key for track in ["utm_", "ref_", "source", "campaign"]):
                        query_params.append(param)
        # Rebuild the URL without the filtered parameters
        cleaned = f"{parsed.scheme}://{parsed.netloc}{path}"
        if query_params:
            cleaned += "?" + "&".join(query_params)
        return cleaned
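

# Minimal usage sketch, not part of the original module. It assumes network
# access; https://example.com is only an illustrative URL.
if __name__ == "__main__":
    utils = WebUtils()
    page_url = "https://example.com"
    html = utils.fetch_url(page_url)
    if html:
        print(utils.extract_metadata(html, page_url)["title"])
        print(f"{len(utils.extract_links(html, page_url))} links found")
    print(utils.clean_url("https://example.com/page?utm_source=news&id=42"))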