Spaces:

tensor-boy
/

ISE

Runtime error

ISE / utils /helpers.py

fikird

Complete rewrite of ISE with advanced RAG and OSINT capabilities

48922fa about 1 year ago

4.64 kB

	"""
	Common helper functions for the search engine.
	"""
	from typing import Dict, Any, List, Optional
	import re
	from datetime import datetime
	import hashlib
	import json

	def clean_text(text: str) -> str:
	    """Clean and normalize text content.

	    Removes every character that is not a word character, whitespace, or
	    one of ``. , ! ? -``; then collapses each run of whitespace into a
	    single space and trims both ends.

	    Args:
	        text: Raw text to normalize.

	    Returns:
	        The cleaned, single-spaced text.
	    """
	    # Remove special characters first: deleting a symbol that sat between
	    # two spaces ("a @ b") would otherwise leave a double space behind.
	    text = re.sub(r"[^\w\s.,!?-]", "", text)

	    # Collapse whitespace last so the result never has repeated spaces.
	    text = re.sub(r"\s+", " ", text)

	    return text.strip()

	def extract_entities(text: str) -> Dict[str, List[str]]:
	    """Extract basic entities from text.

	    Args:
	        text: Text to scan.

	    Returns:
	        A dict with the keys ``"emails"``, ``"phones"``, ``"urls"`` and
	        ``"dates"``, each mapped to the list of non-overlapping regex
	        matches in order of appearance (empty list when nothing matches).
	    """
	    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

	    # NOTE: this pattern is loose; separators are optional, so any run of
	    # five or more digits (and dash-separated dates) also matches.
	    phone_pattern = r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}"

	    # Scheme followed by a run of host-name characters or percent-escapes,
	    # so a match stops at the first "/" or other character outside that set.
	    # The alternation must be a bare "|": written as "\|" it is a literal
	    # pipe and the pattern can never match an ordinary URL.
	    url_pattern = r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+"

	    # Numeric day/month/year style dates separated by "-" or "/".
	    date_pattern = r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"

	    return {
	        "emails": re.findall(email_pattern, text),
	        "phones": re.findall(phone_pattern, text),
	        "urls": re.findall(url_pattern, text),
	        "dates": re.findall(date_pattern, text),
	    }

	def generate_hash(data: Any) -> str:
	    """Generate a hash for data deduplication.

	    Dicts and lists are serialized to JSON with sorted keys, so two dicts
	    that differ only in key order produce the same hash. Any other
	    non-string value is converted with ``str()``.

	    Args:
	        data: Value to fingerprint.

	    Returns:
	        The 32-character hexadecimal MD5 digest of the UTF-8 encoded text.
	    """
	    if isinstance(data, (dict, list)):
	        # default=str keeps containers holding non-JSON values (bytes,
	        # datetime, ...) hashable instead of raising TypeError. It is only
	        # consulted for such values, so hashes of plain JSON data are
	        # unchanged.
	        data = json.dumps(data, sort_keys=True, default=str)
	    elif not isinstance(data, str):
	        data = str(data)

	    return hashlib.md5(data.encode()).hexdigest()

	def format_date(date_str: str) -> Optional[str]:
	    """Format date string to consistent format.

	    The input is tried against a fixed list of layouts, in order, and the
	    first one that parses wins. Day-first layouts come before month-first
	    ones, so an ambiguous value such as ``"01/02/2023"`` is read as
	    1 February.

	    Args:
	        date_str: Date text; leading and trailing whitespace is ignored.

	    Returns:
	        The date as ``YYYY-MM-DD``, or ``None`` when the value is not a
	        string or matches none of the known layouts.
	    """
	    # A non-string (e.g. None from a missing field) makes strptime raise
	    # TypeError; treat it as "not a date" like any other unparsable value.
	    if not isinstance(date_str, str):
	        return None

	    # strptime rejects surrounding whitespace, so trim it up front.
	    candidate = date_str.strip()

	    date_formats = (
	        "%Y-%m-%d",
	        "%d/%m/%Y",
	        "%m/%d/%Y",
	        "%Y/%m/%d",
	        "%d-%m-%Y",
	        "%m-%d-%Y",
	    )

	    for fmt in date_formats:
	        try:
	            parsed = datetime.strptime(candidate, fmt)
	        except ValueError:
	            continue
	        return parsed.strftime("%Y-%m-%d")

	    return None

	def extract_name_parts(full_name: str) -> Dict[str, Optional[str]]:
	    """Extract first, middle, and last names.

	    The name is split on whitespace. The first token is the first name,
	    the last token (when there are at least two) is the last name, and
	    everything in between is joined with single spaces as the middle name.

	    Args:
	        full_name: Full name text.

	    Returns:
	        A dict with the keys ``"first_name"``, ``"middle_name"`` and
	        ``"last_name"``. A part that is absent is ``None``; an empty or
	        whitespace-only name yields ``None`` for all three.
	    """
	    parts = full_name.strip().split()
	    count = len(parts)

	    # Guard each index by the token count so an empty name cannot raise
	    # IndexError.
	    return {
	        "first_name": parts[0] if count >= 1 else None,
	        "middle_name": " ".join(parts[1:-1]) if count >= 3 else None,
	        "last_name": parts[-1] if count >= 2 else None,
	    }

	def generate_username_variants(name: str) -> List[str]:
	    """Generate possible username variants from a name.

	    The name is lower-cased and split on whitespace. With two or more
	    tokens, variants combine the first and last token (joined directly,
	    with ``_`` or ``.``, in both orders, plus initial-based forms). With
	    exactly one token, the token itself and a few affixed forms are
	    produced. An empty name produces no variants.

	    Args:
	        name: Person name to derive usernames from.

	    Returns:
	        The unique variants, in the order they were generated.
	    """
	    parts = name.lower().split()
	    variants: List[str] = []

	    if len(parts) >= 2:
	        first, last = parts[0], parts[-1]
	        variants.extend([
	            first + last,
	            first + "_" + last,
	            first + "." + last,
	            first[0] + last,
	            first + last[0],
	            last + first,
	            last + "_" + first,
	            last + "." + first,
	        ])
	    elif len(parts) == 1:
	        only = parts[0]
	        variants.extend([
	            only,
	            only + "123",
	            "the" + only,
	            "real" + only,
	        ])

	    # dict.fromkeys removes duplicates while keeping first-seen order;
	    # list(set(...)) would return a different order from run to run.
	    return list(dict.fromkeys(variants))

	def calculate_text_similarity(text1: str, text2: str) -> float:
	    """Score how similar two texts are using word-level Jaccard similarity.

	    Each text is lower-cased and split on whitespace into a set of unique
	    words; the score is the number of shared words divided by the number
	    of distinct words across both texts.

	    Args:
	        text1: First text.
	        text2: Second text.

	    Returns:
	        A value between 0.0 and 1.0; 0.0 when neither text has any words.
	    """
	    words_a = set(text1.lower().split())
	    words_b = set(text2.lower().split())

	    all_words = words_a | words_b
	    # With no words on either side the ratio is undefined; report 0.0.
	    if len(all_words) == 0:
	        return 0.0

	    shared_words = words_a & words_b
	    return len(shared_words) / len(all_words)

	def extract_social_links(text: str) -> List[Dict[str, str]]:
	    """Find social media profile links in text.

	    Args:
	        text: Text to scan for profile URLs.

	    Returns:
	        One dict per match with the keys ``"platform"``, ``"username"``
	        (the captured path segment) and ``"url"`` (the full matched text).
	        Matches are grouped by platform in the order twitter, facebook,
	        instagram, linkedin, github, and by position within each platform.
	    """
	    profile_regexes = {
	        "twitter": r"https?://(?:www\.)?twitter\.com/([a-zA-Z0-9_]+)",
	        "facebook": r"https?://(?:www\.)?facebook\.com/([a-zA-Z0-9.]+)",
	        "instagram": r"https?://(?:www\.)?instagram\.com/([a-zA-Z0-9_.]+)",
	        "linkedin": r"https?://(?:www\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)",
	        "github": r"https?://(?:www\.)?github\.com/([a-zA-Z0-9_-]+)",
	    }

	    return [
	        {"platform": site, "username": hit.group(1), "url": hit.group(0)}
	        for site, regex in profile_regexes.items()
	        for hit in re.finditer(regex, text)
	    ]