Spaces:
Runtime error
Runtime error
| """ | |
| Common helper functions for the search engine. | |
| """ | |
| from typing import Dict, Any, List, Optional | |
| import re | |
| from datetime import datetime | |
| import hashlib | |
| import json | |
def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Every character that is not a word character, whitespace, or one of
    ``. , ! ? -`` is removed. Runs of whitespace are then collapsed to a
    single space and leading/trailing whitespace is stripped.

    Args:
        text: Raw text to normalize.

    Returns:
        The cleaned, single-spaced text.
    """
    # Strip special characters *before* collapsing whitespace: doing it the
    # other way round leaves doubled spaces where a removed character sat
    # between two spaces (e.g. "a & b" -> "a  b").
    without_specials = re.sub(r"[^\w\s.,!?-]", "", text)
    return re.sub(r"\s+", " ", without_specials).strip()
def extract_entities(text: str) -> Dict[str, List[str]]:
    """Extract basic entities from text.

    Each entity kind is found with a regular expression; matches are
    returned in the order they appear in ``text`` and are not de-duplicated.

    Args:
        text: Text to scan.

    Returns:
        A dict with the keys ``"emails"``, ``"phones"``, ``"urls"`` and
        ``"dates"``, each mapping to a list of matched substrings.
    """
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    # Deliberately permissive: any run of five or more digits, optionally
    # separated by "-", "." or whitespace, counts as a phone number.
    phone_pattern = r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}"
    # Match the whole URL (path, query, fragment), not just scheme + host.
    # Whitespace, angle brackets and quotes terminate the URL.
    url_pattern = r"https?://[^\s<>\"']+"
    date_pattern = r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"

    # Trailing sentence punctuation is almost never part of the URL itself
    # ("see https://example.com/page."), so trim it from each match.
    urls = [url.rstrip(".,;:!?)") for url in re.findall(url_pattern, text)]

    return {
        "emails": re.findall(email_pattern, text),
        "phones": re.findall(phone_pattern, text),
        "urls": urls,
        "dates": re.findall(date_pattern, text),
    }
def generate_hash(data: Any) -> str:
    """Return the MD5 hex digest of ``data`` for deduplication.

    Strings are hashed as-is. Dicts and lists are serialized to JSON with
    sorted keys first, so key order does not affect the digest. Any other
    value is hashed via its ``str()`` form.
    """
    if isinstance(data, str):
        payload = data
    elif isinstance(data, (dict, list)):
        payload = json.dumps(data, sort_keys=True)
    else:
        payload = str(data)

    digest = hashlib.md5(payload.encode())
    return digest.hexdigest()
def format_date(date_str: str) -> Optional[str]:
    """Format a date string to the consistent ``YYYY-MM-DD`` form.

    The input is tried against a fixed list of formats in order and the
    first one that parses wins. Day-first formats are tried before
    month-first ones, so an ambiguous value such as ``"01/02/2023"`` is read
    as 1 February. Two-digit years are accepted as a last resort and follow
    ``strptime``'s ``%y`` rule (00-68 -> 2000-2068, 69-99 -> 1969-1999).

    Args:
        date_str: Date text; surrounding whitespace is ignored.

    Returns:
        The date as ``YYYY-MM-DD``, or ``None`` if the input is not a string
        or matches none of the known formats.
    """
    # A non-string cannot be parsed; report it like any other unparseable
    # value instead of letting strptime raise TypeError.
    if not isinstance(date_str, str):
        return None

    # strptime does not tolerate leading/trailing whitespace on its own.
    candidate = date_str.strip()

    date_formats = (
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%m/%d/%Y",
        "%Y/%m/%d",
        "%d-%m-%Y",
        "%m-%d-%Y",
        # Two-digit years come last so every four-digit input keeps the
        # interpretation it had before these were added.
        "%d/%m/%y",
        "%m/%d/%y",
        "%d-%m-%y",
        "%m-%d-%y",
    )
    for fmt in date_formats:
        try:
            parsed = datetime.strptime(candidate, fmt)
        except ValueError:
            continue
        return parsed.strftime("%Y-%m-%d")
    return None
def extract_name_parts(full_name: str) -> Dict[str, Optional[str]]:
    """Extract first, middle, and last names.

    The name is split on whitespace. The first token is the first name, the
    last token (when there are at least two) is the last name, and anything
    in between is joined with single spaces as the middle name.

    Args:
        full_name: The full name to split.

    Returns:
        A dict with the keys ``"first_name"``, ``"middle_name"`` and
        ``"last_name"``. A part that is absent is ``None``; an empty or
        whitespace-only name yields ``None`` for all three.
    """
    parts = full_name.split()
    count = len(parts)

    # Guard every index on the token count so an empty name no longer
    # raises IndexError.
    return {
        "first_name": parts[0] if count >= 1 else None,
        "middle_name": " ".join(parts[1:-1]) if count >= 3 else None,
        "last_name": parts[-1] if count >= 2 else None,
    }
def generate_username_variants(name: str) -> List[str]:
    """Generate possible username variants from a name.

    The name is lower-cased and split on whitespace. With two or more
    tokens, variants combine the first and last token (middle tokens are
    ignored). With a single token, variants add common prefixes/suffixes.
    An empty name yields an empty list.

    Args:
        name: A person's name, e.g. ``"John Doe"``.

    Returns:
        Unique variants in a stable, deterministic order (the order in
        which they are generated).
    """
    parts = name.lower().split()
    variants: List[str] = []

    if len(parts) >= 2:
        first, last = parts[0], parts[-1]
        variants = [
            first + last,
            f"{first}_{last}",
            f"{first}.{last}",
            first[0] + last,
            first + last[0],
            last + first,
            f"{last}_{first}",
            f"{last}.{first}",
        ]
    elif len(parts) == 1:
        only = parts[0]
        variants = [only, only + "123", "the" + only, "real" + only]

    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would make the result order vary between interpreter runs.
    return list(dict.fromkeys(variants))
def calculate_text_similarity(text1: str, text2: str) -> float:
    """Return the Jaccard similarity of the two texts' word sets.

    Each text is lower-cased and split on whitespace; the score is the
    number of shared distinct words divided by the number of distinct words
    overall. When neither text contains a word the score is 0.0.
    """
    words_a = {*text1.lower().split()}
    words_b = {*text2.lower().split()}

    all_words = words_a | words_b
    if not all_words:
        return 0.0

    shared_words = words_a & words_b
    return len(shared_words) / len(all_words)
def extract_social_links(text: str) -> List[Dict[str, str]]:
    """Find social media profile links in ``text``.

    Each supported platform has its own URL pattern whose single capture
    group is the username. Results are grouped by platform in the order the
    patterns are declared, and within a platform follow their order of
    appearance in the text. Each result carries the keys ``"platform"``,
    ``"username"`` and ``"url"`` (the matched portion of the link).
    """
    social_patterns = {
        "twitter": r"https?://(?:www\.)?twitter\.com/([a-zA-Z0-9_]+)",
        "facebook": r"https?://(?:www\.)?facebook\.com/([a-zA-Z0-9.]+)",
        "instagram": r"https?://(?:www\.)?instagram\.com/([a-zA-Z0-9_.]+)",
        "linkedin": r"https?://(?:www\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)",
        "github": r"https?://(?:www\.)?github\.com/([a-zA-Z0-9_-]+)",
    }

    return [
        {"platform": site, "username": hit.group(1), "url": hit.group(0)}
        for site, regex in social_patterns.items()
        for hit in re.finditer(regex, text)
    ]