| """ | |
| Web scraping and processing utilities. | |
| """ | |
| from typing import Dict, Any, List, Optional | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
| from urllib.parse import urlparse, urljoin | |
| from tenacity import retry, stop_after_attempt, wait_exponential | |


class WebUtils:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
    def _get(self, url: str, timeout: int) -> requests.Response:
        """GET a URL, retrying transient failures with exponential backoff."""
        response = self.session.get(url, timeout=timeout)
        response.raise_for_status()
        return response

    def fetch_url(self, url: str, timeout: int = 10) -> Optional[str]:
        """Fetch content from a URL, returning None on failure."""
        # requests is synchronous, so this is a plain method rather than a coroutine
        try:
            return self._get(url, timeout).text
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            return None

    def extract_text(self, html: str) -> str:
        """Extract clean text from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()
        # Get text and clean it
        text = soup.get_text(separator="\n", strip=True)
        # Remove excessive newlines
        text = re.sub(r"\n\s*\n", "\n\n", text)
        return text.strip()

    def extract_metadata(self, html: str, url: str) -> Dict[str, Any]:
        """Extract metadata from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        metadata = {
            "url": url,
            "title": None,
            "description": None,
            "keywords": None,
            "author": None,
            "published_date": None
        }
        # Extract title (soup.title.string is None when the tag contains nested markup)
        if soup.title and soup.title.string:
            metadata["title"] = soup.title.string.strip()
        # Extract meta tags, checking both the name= and property= attributes
        for tag in soup.find_all("meta"):
            name = (tag.get("name") or tag.get("property") or "").lower()
            content = tag.get("content")
            if name == "description":
                metadata["description"] = content
            elif name == "keywords":
                metadata["keywords"] = content
            elif name == "author":
                metadata["author"] = content
            elif name in ("article:published_time", "published_time", "publication_date"):
                metadata["published_date"] = content
        return metadata

    def extract_links(self, html: str, base_url: str) -> List[str]:
        """Extract all links from HTML content."""
        soup = BeautifulSoup(html, "html.parser")
        links = []
        for link in soup.find_all("a"):
            href = link.get("href")
            if href:
                # Convert relative URLs to absolute
                absolute_url = urljoin(base_url, href)
                # Only include http(s) URLs
                if absolute_url.startswith(("http://", "https://")):
                    links.append(absolute_url)
        return list(set(links))  # Remove duplicates

    def is_valid_url(self, url: str) -> bool:
        """Check if a URL is valid."""
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False

    def clean_url(self, url: str) -> str:
        """Clean and normalize a URL."""
        parsed = urlparse(url)
        path = parsed.path
        # Drop common tracking parameters (utm_*, ref_*, source, campaign)
        query_params = []
        if parsed.query:
            for param in parsed.query.split("&"):
                if "=" in param:
                    key = param.split("=")[0].lower()
                    if not any(track in key for track in ["utm_", "ref_", "source", "campaign"]):
                        query_params.append(param)
        # Rebuild the URL without the filtered parameters
        cleaned = f"{parsed.scheme}://{parsed.netloc}{path}"
        if query_params:
            cleaned += "?" + "&".join(query_params)
        return cleaned
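

# Minimal usage sketch, not part of the original module. It assumes network
# access; https://example.com is only an illustrative URL.
if __name__ == "__main__":
    utils = WebUtils()
    page_url = "https://example.com"
    html = utils.fetch_url(page_url)
    if html:
        print(utils.extract_metadata(html, page_url)["title"])
        print(f"{len(utils.extract_links(html, page_url))} links found")
    print(utils.clean_url("https://example.com/page?utm_source=news&id=42"))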