Commit a3440c5 · committed by fikird
1 Parent(s): 48922fa

feat: Enhanced search engine with caching and metadata

- Added result caching with TTL
- Improved content extraction
- Enhanced metadata collection
- Optimized dependencies
- Removed unnecessary files
- apt.txt +0 -8
- engines/search.py +159 -13
- osint_engine.py +0 -489
- packages.txt +0 -25
- requirements.txt +32 -35
- search_engine.py +0 -219
- space.yml +0 -11
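
The result caching described in the commit message keys each entry by an MD5 hash of the query payload and evicts it after a 24-hour TTL. A minimal standalone sketch of that pattern follows; the class name TTLCache and the example query are illustrative, while the key/get/set logic mirrors the new helpers in engines/search.py further down.

import hashlib
import json
from datetime import datetime, timedelta
from typing import Any, Dict, Optional

class TTLCache:
    """In-memory result cache with a time-to-live, mirroring _get_cache_key / _get_cached_result / _set_cached_result."""
    def __init__(self, ttl: timedelta = timedelta(hours=24)):
        self.cache: Dict[str, tuple] = {}  # key -> (result, timestamp)
        self.ttl = ttl

    def key(self, query: str, **kwargs) -> str:
        # Deterministic key: MD5 of the sorted query payload
        payload = json.dumps({"query": query, **kwargs}, sort_keys=True)
        return hashlib.md5(payload.encode()).hexdigest()

    def get(self, key: str) -> Optional[Any]:
        # Serve a hit only while it is younger than the TTL; otherwise drop it
        if key in self.cache:
            result, stamp = self.cache[key]
            if datetime.now() - stamp < self.ttl:
                return result
            del self.cache[key]
        return None

    def set(self, key: str, result: Any) -> None:
        self.cache[key] = (result, datetime.now())

cache = TTLCache()
k = cache.key("python asyncio tutorial")
if cache.get(k) is None:
    cache.set(k, {"answer": "...", "sources": []})
print(cache.get(k))  # served from the cache on the second lookup
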
apt.txt
DELETED
@@ -1,8 +0,0 @@
-python3-dev
-python3-pip
-build-essential
-gcc
-g++
-git
-cmake
-libgomp1

engines/search.py
CHANGED
@@ -1,5 +1,5 @@
 """
-RAG-based search engine with
+Advanced RAG-based search engine with multi-source intelligence.
 """
 from typing import List, Dict, Any, Optional
 import asyncio
@@ -13,6 +13,12 @@ from googlesearch import search as gsearch
 import requests
 from bs4 import BeautifulSoup
 from tenacity import retry, stop_after_attempt, wait_exponential
+import json
+import time
+from datetime import datetime, timedelta
+import hashlib
+from urllib.parse import urlparse
+import re
 
 class SearchEngine:
     def __init__(self):
@@ -23,12 +29,42 @@ class SearchEngine:
             chunk_size=500,
             chunk_overlap=50
         )
+        self.cache = {}
+        self.cache_ttl = timedelta(hours=24)
+        self.search_delay = 2  # seconds between searches
+        self.last_search_time = datetime.min
+
+    def _get_cache_key(self, query: str, **kwargs) -> str:
+        """Generate cache key from query and kwargs."""
+        cache_data = {
+            "query": query,
+            **kwargs
+        }
+        return hashlib.md5(json.dumps(cache_data, sort_keys=True).encode()).hexdigest()
+
+    def _get_cached_result(self, cache_key: str) -> Optional[Dict[str, Any]]:
+        """Get result from cache if valid."""
+        if cache_key in self.cache:
+            result, timestamp = self.cache[cache_key]
+            if datetime.now() - timestamp < self.cache_ttl:
+                return result
+            del self.cache[cache_key]
+        return None
+
+    def _set_cached_result(self, cache_key: str, result: Dict[str, Any]):
+        """Store result in cache."""
+        self.cache[cache_key] = (result, datetime.now())
 
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
     async def search_web(self, query: str, max_results: int = 10) -> List[Dict[str, str]]:
         """Perform web search using multiple search engines."""
         results = []
 
+        # Respect rate limiting
+        time_since_last = datetime.now() - self.last_search_time
+        if time_since_last.total_seconds() < self.search_delay:
+            await asyncio.sleep(self.search_delay - time_since_last.total_seconds())
+
         # DuckDuckGo Search
         try:
             with DDGS() as ddgs:
@@ -44,8 +80,26 @@
         except Exception as e:
             print(f"Google search error: {e}")
 
+        self.last_search_time = datetime.now()
         return results[:max_results]
 
+    def _clean_html(self, html: str) -> str:
+        """Clean HTML content."""
+        # Remove script and style elements
+        html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL)
+        html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL)
+
+        # Remove comments
+        html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
+
+        # Remove remaining tags
+        html = re.sub(r'<[^>]+>', ' ', html)
+
+        # Clean whitespace
+        html = re.sub(r'\s+', ' ', html).strip()
+
+        return html
+
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
     async def fetch_content(self, url: str) -> Optional[str]:
         """Fetch and extract content from a webpage."""
@@ -56,25 +110,90 @@ class SearchEngine:
             response = requests.get(url, headers=headers, timeout=10)
             response.raise_for_status()
 
+            # Extract main content
             soup = BeautifulSoup(response.text, "html.parser")
 
             # Remove unwanted elements
-            for element in soup(["script", "style", "nav", "footer", "header"]):
+            for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
                 element.decompose()
 
-
+            # Try to find main content
+            main_content = None
+
+            # Look for article tag
+            if soup.find("article"):
+                main_content = soup.find("article")
+
+            # Look for main tag
+            elif soup.find("main"):
+                main_content = soup.find("main")
+
+            # Look for div with common content class names
+            elif soup.find("div", class_=re.compile(r"content|article|post|entry")):
+                main_content = soup.find("div", class_=re.compile(r"content|article|post|entry"))
+
+            # Use body if no main content found
+            if not main_content:
+                main_content = soup.body
+
+            # Extract text
+            if main_content:
+                text = self._clean_html(str(main_content))
+            else:
+                text = self._clean_html(response.text)
+
             return text
         except Exception as e:
             print(f"Error fetching {url}: {e}")
             return None
 
+    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
+        """Extract metadata from webpage."""
+        metadata = {
+            "url": url,
+            "domain": urlparse(url).netloc,
+            "title": None,
+            "description": None,
+            "published_date": None,
+            "author": None,
+            "keywords": None
+        }
+
+        # Extract title
+        if soup.title:
+            metadata["title"] = soup.title.string
+
+        # Extract meta tags
+        for meta in soup.find_all("meta"):
+            name = meta.get("name", "").lower()
+            property = meta.get("property", "").lower()
+            content = meta.get("content")
+
+            if name == "description" or property == "og:description":
+                metadata["description"] = content
+            elif name == "author":
+                metadata["author"] = content
+            elif name == "keywords":
+                metadata["keywords"] = content
+            elif name in ["published_time", "article:published_time"]:
+                metadata["published_date"] = content
+
+        return metadata
+
     async def process_search_results(self, query: str) -> Dict[str, Any]:
         """Process search results and create a RAG-based answer."""
+        cache_key = self._get_cache_key(query)
+        cached_result = self._get_cached_result(cache_key)
+        if cached_result:
+            return cached_result
+
         # Perform web search
         search_results = await self.search_web(query)
 
         # Fetch content from search results
         documents = []
+        metadata_list = []
+
         for result in search_results:
             url = result.get("link")
             if not url:
@@ -84,17 +203,28 @@ class SearchEngine:
             if content:
                 # Split content into chunks
                 chunks = self.text_splitter.split_text(content)
+
+                # Store metadata
+                metadata = {
+                    "source": url,
+                    "title": result.get("title", url),
+                    **result
+                }
+                metadata_list.append(metadata)
+
+                # Create documents
                 for chunk in chunks:
                     doc = Document(
                         page_content=chunk,
-                        metadata=
+                        metadata=metadata
                     )
                     documents.append(doc)
 
         if not documents:
             return {
                 "answer": "I couldn't find any relevant information.",
-                "sources": []
+                "sources": [],
+                "metadata": []
             }
 
         # Create vector store
@@ -109,18 +239,33 @@ class SearchEngine:
         # Get relevant documents
         relevant_docs = chain.retriever.get_relevant_documents(query)
 
-        #
+        # Extract unique sources and content
         sources = []
         content = []
-
-        if doc.metadata["source"] not in sources:
-            sources.append(doc.metadata["source"])
-            content.append(doc.page_content)
+        used_metadata = []
 
-
+        for doc in relevant_docs[:5]:  # Limit to top 5 most relevant docs
+            source = doc.metadata["source"]
+            if source not in sources:
+                sources.append(source)
+                content.append(doc.page_content)
+
+                # Find corresponding metadata
+                for meta in metadata_list:
+                    if meta["source"] == source:
+                        used_metadata.append(meta)
+                        break
+
+        result = {
             "answer": "\n\n".join(content),
-            "sources": sources
+            "sources": sources,
+            "metadata": used_metadata
         }
+
+        # Cache the result
+        self._set_cached_result(cache_key, result)
+
+        return result
 
     async def search(self, query: str) -> Dict[str, Any]:
         """Main search interface."""
@@ -129,5 +274,6 @@ class SearchEngine:
         except Exception as e:
             return {
                 "answer": f"An error occurred: {str(e)}",
-                "sources": []
+                "sources": [],
+                "metadata": []
             }

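For orientation, a usage sketch of the reworked engine. This is an assumed driver, not part of the commit: it relies on the engines/search.py module shown above, on its async search() interface, and on the heavy dependencies from requirements.txt being installed. A repeat of the same query inside the 24-hour TTL is answered from the in-memory cache instead of re-fetching pages.

import asyncio
from engines.search import SearchEngine  # module changed in this commit

async def main():
    engine = SearchEngine()

    # First call: searches DuckDuckGo/Google, fetches and chunks pages, builds the RAG answer
    result = await engine.search("what is retrieval-augmented generation")
    print(result["answer"][:300])
    print(result["sources"])   # de-duplicated source URLs
    print(result["metadata"])  # per-source title and other fields collected for each result

    # Second call with the same query: served from the 24-hour result cache
    cached = await engine.search("what is retrieval-augmented generation")
    assert cached["sources"] == result["sources"]

asyncio.run(main())
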
osint_engine.py
DELETED
@@ -1,489 +0,0 @@
-import os
-import re
-import json
-import time
-import asyncio
-import aiohttp
-import requests
-import httpx
-from PIL import Image
-from io import BytesIO
-from typing import Dict, List, Any, Union, Optional
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.chrome.service import Service
-from webdriver_manager.chrome import ChromeDriverManager
-from geopy.geocoders import Nominatim
-from waybackpy import WaybackMachineCDXServerAPI
-import whois
-from datetime import datetime
-from googlesearch import search as google_search
-import base64
-import io
-
-class OSINTEngine:
-    """OSINT capabilities for advanced information gathering"""
-
-    def __init__(self):
-        self.chrome_options = Options()
-        self.chrome_options.add_argument('--headless')
-        self.chrome_options.add_argument('--no-sandbox')
-        self.chrome_options.add_argument('--disable-dev-shm-usage')
-        self.setup_apis()
-        self.session = None
-        self.platforms = {
-            "twitter": "https://twitter.com/{}",
-            "instagram": "https://instagram.com/{}",
-            "facebook": "https://facebook.com/{}",
-            "linkedin": "https://linkedin.com/in/{}",
-            "github": "https://github.com/{}",
-            "reddit": "https://reddit.com/user/{}",
-            "youtube": "https://youtube.com/@{}",
-            "tiktok": "https://tiktok.com/@{}",
-            "pinterest": "https://pinterest.com/{}",
-            "snapchat": "https://snapchat.com/add/{}",
-            "twitch": "https://twitch.tv/{}",
-            "medium": "https://medium.com/@{}",
-            "devto": "https://dev.to/{}",
-            "stackoverflow": "https://stackoverflow.com/users/{}"
-        }
-
-    def setup_apis(self):
-        """Initialize API clients"""
-        self.geolocator = Nominatim(user_agent="intelligent_search")
-        self.http_client = httpx.AsyncClient()
-
-    async def initialize(self):
-        if not self.session:
-            self.session = aiohttp.ClientSession()
-
-    async def close(self):
-        if self.session:
-            await self.session.close()
-            self.session = None
-
-    async def search_username(self, username: str) -> Dict[str, Any]:
-        """Search for username across multiple platforms"""
-        results = {
-            'platforms': [],
-            'social_media': {},
-            'websites': []
-        }
-
-        # Common social media platforms
-        platforms = [
-            {'name': 'GitHub', 'url': f'https://github.com/{username}'},
-            {'name': 'Twitter', 'url': f'https://twitter.com/{username}'},
-            {'name': 'Instagram', 'url': f'https://instagram.com/{username}'},
-            {'name': 'LinkedIn', 'url': f'https://linkedin.com/in/{username}'},
-            {'name': 'Facebook', 'url': f'https://facebook.com/{username}'},
-            {'name': 'YouTube', 'url': f'https://youtube.com/@{username}'},
-        ]
-
-        async with aiohttp.ClientSession() as session:
-            tasks = []
-            for platform in platforms:
-                task = self.check_profile(session, platform['url'], platform['name'])
-                tasks.append(task)
-
-            platform_results = await asyncio.gather(*tasks)
-            results['platforms'] = [r for r in platform_results if r is not None]
-
-        # Google search for additional mentions
-        try:
-            search_query = f'"{username}" OR "@{username}" -site:twitter.com -site:facebook.com -site:instagram.com'
-            web_results = list(google_search(search_query, num_results=5))
-            results['websites'] = web_results
-        except Exception as e:
-            results['websites'] = [str(e)]
-
-        return results
-
-    async def check_profile(self, session, url: str, platform: str) -> Dict[str, str]:
-        """Check if a profile exists on a platform"""
-        try:
-            async with session.get(url) as response:
-                if response.status == 200:
-                    return {
-                        'platform': platform,
-                        'url': url,
-                        'exists': True
-                    }
-        except:
-            pass
-        return None
-
-    async def check_username(self, username: str, platform: str = "all") -> List[Dict]:
-        await self.initialize()
-        results = []
-
-        platforms_to_check = [platform] if platform != "all" else self.platforms.keys()
-
-        for platform_name in platforms_to_check:
-            if platform_name in self.platforms:
-                url = self.platforms[platform_name].format(username)
-                try:
-                    async with self.session.get(url) as response:
-                        exists = response.status == 200
-                        results.append({
-                            "platform": platform_name,
-                            "url": url,
-                            "exists": exists
-                        })
-                except:
-                    results.append({
-                        "platform": platform_name,
-                        "url": url,
-                        "exists": False,
-                        "error": "Connection failed"
-                    })
-
-        return results
-
-    async def search_image(self, image_url: str) -> Dict[str, Any]:
-        """Image analysis and reverse search"""
-        results = {
-            'analysis': {},
-            'similar_images': [],
-            'error': None
-        }
-
-        try:
-            # Download and analyze image
-            response = requests.get(image_url)
-            img = Image.open(BytesIO(response.content))
-
-            # Basic image analysis
-            results['analysis'] = {
-                'format': img.format,
-                'size': img.size,
-                'mode': img.mode
-            }
-
-            # Perform reverse image search using Google Lens
-            search_url = f"https://lens.google.com/uploadbyurl?url={image_url}"
-            results['similar_images'].append({
-                'source': 'Google Lens',
-                'url': search_url
-            })
-
-        except Exception as e:
-            results['error'] = str(e)
-
-        return results
-
-    async def gather_personal_info(self, data: Dict[str, str]) -> Dict[str, Any]:
-        """Gather personal information from various sources"""
-        results = {}
-
-        if 'location' in data:
-            results['location'] = await self.analyze_location(data['location'])
-
-        if 'domain' in data:
-            results['domain'] = self.analyze_domain(data['domain'])
-
-        return results
-
-    async def analyze_location(self, location: str) -> Dict[str, Any]:
-        """Analyze location information"""
-        try:
-            location_data = self.geolocator.geocode(location)
-            if location_data:
-                return {
-                    'address': location_data.address,
-                    'latitude': location_data.latitude,
-                    'longitude': location_data.longitude,
-                    'raw': location_data.raw
-                }
-        except Exception as e:
-            return {'error': str(e)}
-        return None
-
-    def analyze_domain(self, domain: str) -> Dict[str, Any]:
-        """Analyze domain information"""
-        try:
-            domain_info = whois.whois(domain)
-            return {
-                'registrar': domain_info.registrar,
-                'creation_date': domain_info.creation_date,
-                'expiration_date': domain_info.expiration_date,
-                'last_updated': domain_info.updated_date,
-                'status': domain_info.status
-            }
-        except Exception as e:
-            return {'error': str(e)}
-
-    async def search_historical_data(self, url: str) -> List[Dict[str, Any]]:
-        """Search for historical data using Wayback Machine"""
-        results = []
-
-        try:
-            user_agent = "Mozilla/5.0"
-            cdx = WaybackMachineCDXServerAPI(url, user_agent)
-
-            for snapshot in cdx.snapshots():
-                results.append({
-                    'timestamp': snapshot.timestamp,
-                    'url': snapshot.archive_url,
-                    'status': snapshot.status_code,
-                    'mime_type': snapshot.mime_type
-                })
-
-        except Exception as e:
-            results.append({'error': str(e)})
-
-        return results
-
-    async def search_person(self, name: str, location: Optional[str] = None) -> List[Dict]:
-        await self.initialize()
-        results = []
-
-        # Format search query
-        query = f"{name}"
-        if location:
-            query += f" {location}"
-
-        # Simulate searching various sources
-        sources = ["social_media", "news", "public_records", "professional"]
-
-        for source in sources:
-            # Simulate different data sources
-            if source == "social_media":
-                profile = {
-                    "name": name,
-                    "location": location,
-                    "source": "Social Media",
-                    "profile_image": "https://example.com/profile.jpg",
-                    "social_links": [
-                        {"platform": "LinkedIn", "url": f"https://linkedin.com/in/{name.lower().replace(' ', '-')}"},
-                        {"platform": "Twitter", "url": f"https://twitter.com/{name.lower().replace(' ', '')}"}
-                    ],
-                    "occupation": "Professional",
-                    "last_seen": datetime.now().strftime("%Y-%m-%d")
-                }
-                results.append(profile)
-
-            elif source == "news":
-                news = {
-                    "name": name,
-                    "source": "News Articles",
-                    "mentions": [
-                        {
-                            "title": f"Article about {name}",
-                            "url": "https://example.com/news",
-                            "date": "2023-01-01"
-                        }
-                    ]
-                }
-                results.append(news)
-
-            elif source == "public_records":
-                record = {
-                    "name": name,
-                    "source": "Public Records",
-                    "location": location,
-                    "age_range": "25-35",
-                    "possible_relatives": ["Jane Doe", "John Doe Sr."],
-                    "previous_locations": ["New York, NY", "Los Angeles, CA"]
-                }
-                results.append(record)
-
-            elif source == "professional":
-                prof = {
-                    "name": name,
-                    "source": "Professional Records",
-                    "education": ["University Example"],
-                    "work_history": ["Company A", "Company B"],
-                    "skills": ["Leadership", "Management"]
-                }
-                results.append(prof)
-
-        return results
-
-    async def get_person_details(self, person_id: str) -> Dict:
-        """Get detailed information about a specific person"""
-        await self.initialize()
-
-        # Simulate gathering detailed information
-        details = {
-            "personal": {
-                "name": person_id,
-                "age_range": "25-35",
-                "locations": ["Current City, Country", "Previous City, Country"],
-                "education": ["University Name", "High School Name"],
-                "occupation": "Current Occupation"
-            },
-            "social_media": {
-                "profiles": [
-                    {
-                        "platform": "LinkedIn",
-                        "url": f"https://linkedin.com/in/{person_id}",
-                        "last_active": "2023-01-01"
-                    },
-                    {
-                        "platform": "Twitter",
-                        "url": f"https://twitter.com/{person_id}",
-                        "last_active": "2023-01-01"
-                    }
-                ]
-            },
-            "contact": {
-                "email_pattern": "j***@example.com",
-                "phone_pattern": "+1 (***) ***-**89"
-            },
-            "images": [
-                {
-                    "url": "https://example.com/profile1.jpg",
-                    "source": "LinkedIn",
-                    "date": "2023-01-01"
-                }
-            ],
-            "activities": {
-                "recent_posts": [
-                    {
-                        "platform": "Twitter",
-                        "content": "Example post content",
-                        "date": "2023-01-01"
-                    }
-                ],
-                "mentions": [
-                    {
-                        "source": "News Article",
-                        "title": "Article Title",
-                        "url": "https://example.com/article",
-                        "date": "2023-01-01"
-                    }
-                ]
-            }
-        }
-
-        return details
-
-    async def analyze_image(self, image_path: str) -> Dict:
-        """Analyze an image and return information about it"""
-        try:
-            # Open and analyze the image
-            img = Image.open(image_path if os.path.exists(image_path) else io.BytesIO(requests.get(image_path).content))
-
-            analysis = {
-                "format": img.format,
-                "size": f"{img.size[0]}x{img.size[1]}",
-                "mode": img.mode,
-                "metadata": {},
-            }
-
-            # Extract EXIF data if available
-            if hasattr(img, '_getexif') and img._getexif():
-                exif = img._getexif()
-                if exif:
-                    analysis["metadata"] = {
-                        "datetime": exif.get(306, "Unknown"),
-                        "make": exif.get(271, "Unknown"),
-                        "model": exif.get(272, "Unknown"),
-                        "software": exif.get(305, "Unknown")
-                    }
-
-            return analysis
-        except Exception as e:
-            return {"error": str(e)}
-
-    async def find_similar_images(self, image_url: str) -> List[Dict]:
-        """Find similar images"""
-        # Simulate finding similar images
-        return [
-            {
-                "url": "https://example.com/similar1.jpg",
-                "similarity": 0.95,
-                "source": "Website A"
-            },
-            {
-                "url": "https://example.com/similar2.jpg",
-                "similarity": 0.85,
-                "source": "Website B"
-            }
-        ]
-
-    async def get_location_info(self, location: str) -> Dict:
-        """Get information about a location"""
-        # Simulate location information retrieval
-        return {
-            "name": location,
-            "coordinates": {"lat": 40.7128, "lng": -74.0060},
-            "country": "United States",
-            "timezone": "America/New_York",
-            "population": "8.4 million",
-            "weather": "Sunny, 72°F"
-        }
-
-    async def get_domain_info(self, domain: str) -> Dict:
-        """Get information about a domain"""
-        # Simulate domain information retrieval
-        return {
-            "domain": domain,
-            "registrar": "Example Registrar",
-            "creation_date": "2020-01-01",
-            "expiration_date": "2024-01-01",
-            "nameservers": ["ns1.example.com", "ns2.example.com"],
-            "ip_address": "192.0.2.1",
-            "location": "United States"
-        }
-
-# Helper function to create document from gathered information
-def create_report(data: Dict[str, Any], template: str = "default") -> str:
-    """Create a formatted report from gathered information"""
-    if template == "default":
-        report = "# OSINT Investigation Report\n\n"
-        report += f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
-
-        for section, content in data.items():
-            report += f"## {section.title()}\n"
-            if isinstance(content, dict):
-                for key, value in content.items():
-                    report += f"* {key}: {value}\n"
-            elif isinstance(content, list):
-                for item in content:
-                    if isinstance(item, dict):
-                        for k, v in item.items():
-                            report += f"* {k}: {v}\n"
-                    else:
-                        report += f"* {item}\n"
-            else:
-                report += f"{content}\n"
-            report += "\n"
-
-        return report
-    else:
-        raise ValueError(f"Template '{template}' not found")
-
-async def create_report_from_data(data: Dict) -> Dict:
-    """Create a formatted report from the gathered data"""
-    engine = OSINTEngine()
-
-    try:
-        report = {}
-
-        if "username" in data:
-            report["platforms"] = await engine.check_username(data["username"], data.get("platform", "all"))
-
-        if "image_url" in data:
-            report["analysis"] = await engine.analyze_image(data["image_url"])
-            report["similar_images"] = await engine.find_similar_images(data["image_url"])
-
-        if "location" in data:
-            report["location"] = await engine.get_location_info(data["location"])
-
-        if "domain" in data:
-            report["domain"] = await engine.get_domain_info(data["domain"])
-
-        if "name" in data:
-            report["matches"] = await engine.search_person(data["name"], data.get("location"))
-
-        if "person_id" in data:
-            report["details"] = await engine.get_person_details(data["person_id"])
-
-        await engine.close()
-        return report
-
-    except Exception as e:
-        await engine.close()
-        return {"error": str(e)}

packages.txt
DELETED
@@ -1,25 +0,0 @@
-python3-dev
-python3-pip
-build-essential
-gcc
-g++
-git
-cmake
-libgomp1
-libglib2.0-0
-libnss3
-libnspr4
-libatk1.0-0
-libatk-bridge2.0-0
-libcups2
-libdrm2
-libdbus-1-3
-libxkbcommon0
-libxcomposite1
-libxdamage1
-libxfixes3
-libxrandr2
-libgbm1
-libpango-1.0-0
-libcairo2
-libasound2

requirements.txt
CHANGED
@@ -1,42 +1,39 @@
-# Core
-
-
-
-
-
-
-
-requests==2.31.0
-aiohttp==3.8.5
-httpx==0.24.1
-beautifulsoup4==4.12.2
-selenium==4.15.2
-webdriver-manager==4.0.1
-googlesearch-python==1.2.3
-duckduckgo-search==3.8.5
+# Core Dependencies
+python-dotenv>=1.0.0
+langchain>=0.0.200
+transformers>=4.30.2
+sentence-transformers>=2.2.2
+faiss-cpu>=1.7.4
+torch>=2.0.1 --index-url https://download.pytorch.org/whl/cpu
+accelerate>=0.21.0
 
-#
-
-
-
-
-
+# Web Scraping & Search
+duckduckgo-search>=3.8.3
+beautifulsoup4>=4.12.2
+requests>=2.31.0
+google>=3.0.0
+tenacity>=8.2.2
+aiohttp>=3.8.5
+httpx>=0.24.1
 
-#
-
+# Image Processing
+Pillow>=10.0.0
+face-recognition>=1.3.0
+opencv-python-headless>=4.8.0
 
 # OSINT Tools
-
-
-
-
-sherlock-project==0.14.3
+holehe>=1.61
+sherlock-project>=0.14.0
+python-whois>=0.8.0
+geopy>=2.3.0
 
-#
-
-
+# UI
+gradio>=3.40.1
+markdown>=3.4.3
 
 # Utilities
-python-
-
-
+python-dateutil>=2.8.2
+tqdm>=4.65.0
+validators>=0.20.0
+urllib3>=2.0.4
+certifi>=2023.7.22
search_engine.py
DELETED
@@ -1,219 +0,0 @@
-from typing import Dict, List, Any
-import requests
-from bs4 import BeautifulSoup
-from duckduckgo_search import ddg
-from transformers import pipeline
-from langchain.embeddings import HuggingFaceEmbeddings
-import time
-import json
-import os
-from urllib.parse import urlparse
-import asyncio
-
-class ModelManager:
-    """Manages AI models for text processing"""
-    def __init__(self):
-        # Initialize with smaller, CPU-friendly models
-        self.summarizer = pipeline(
-            "summarization",
-            model="facebook/bart-base",
-            device=-1  # Use CPU
-        )
-        self.embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-MiniLM-L6-v2"
-        )
-
-    def generate_summary(self, text: str, max_length: int = 150) -> str:
-        """Generate a concise summary of the text"""
-        if not text or len(text.split()) < 50:
-            return text
-
-        try:
-            summary = self.summarizer(
-                text,
-                max_length=max_length,
-                min_length=30,
-                do_sample=False
-            )[0]['summary_text']
-            return summary
-        except Exception as e:
-            print(f"Error in summarization: {e}")
-            return text[:500] + "..."
-
-class ContentProcessor:
-    """Processes and analyzes different types of content"""
-    def __init__(self):
-        self.model_manager = ModelManager()
-
-    def process_content(self, content: str) -> Dict[str, Any]:
-        """Process content and generate insights"""
-        if not content:
-            return {"summary": "", "insights": []}
-
-        try:
-            summary = self.model_manager.generate_summary(content)
-            return {
-                "summary": summary,
-                "insights": []  # Simplified for CPU deployment
-            }
-        except Exception as e:
-            print(f"Error processing content: {e}")
-            return {"summary": content[:500] + "...", "insights": []}
-
-class OSINTEngine:
-    """Main OSINT engine class"""
-    def __init__(self):
-        from osint_engine import OSINTEngine as ExternalOSINT
-        self.engine = ExternalOSINT()
-
-    async def search_username(self, query: str) -> Dict[str, Any]:
-        """Search for usernames"""
-        return await self.engine.search_username(query)
-
-    async def search_image(self, query: str) -> Dict[str, Any]:
-        """Search for images"""
-        return await self.engine.search_image(query)
-
-    async def search_social_media(self, query: str, platform: str) -> Dict[str, Any]:
-        """Search for social media profiles"""
-        results = await self.engine.search_username(query)
-        if platform:
-            return {platform: [r for r in results.get('platforms', []) if r['platform'].lower() == platform.lower()]}
-        return results
-
-    async def gather_personal_info(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
-        """Gather personal information"""
-        return await self.engine.gather_personal_info(kwargs)
-
-    async def search_historical_data(self, query: str) -> Dict[str, Any]:
-        """Search for historical data"""
-        return await self.engine.search_historical_data(query)
-
-class WebSearchEngine:
-    """Main search engine class"""
-    def __init__(self):
-        self.processor = ContentProcessor()
-        self.session = requests.Session()
-        self.request_delay = 1.0
-        self.last_request_time = 0
-        self.osint_engine = OSINTEngine()  # Add OSINT engine
-
-    def is_valid_url(self, url: str) -> bool:
-        """Check if URL is valid for crawling"""
-        try:
-            parsed = urlparse(url)
-            return bool(parsed.netloc and parsed.scheme in ['http', 'https'])
-        except:
-            return False
-
-    def get_metadata(self, soup: BeautifulSoup) -> Dict[str, str]:
-        """Extract metadata from page"""
-        metadata = {}
-
-        # Get title
-        title = soup.find('title')
-        if title:
-            metadata['title'] = title.text.strip()
-
-        # Get meta description
-        desc = soup.find('meta', attrs={'name': 'description'})
-        if desc:
-            metadata['description'] = desc.get('content', '')
-
-        # Get publication date
-        date = soup.find('meta', attrs={'property': 'article:published_time'})
-        if date:
-            metadata['published_date'] = date.get('content', '').split('T')[0]
-
-        return metadata
-
-    def process_url(self, url: str) -> Dict[str, Any]:
-        """Process a single URL"""
-        if not self.is_valid_url(url):
-            return None
-
-        try:
-            # Rate limiting
-            current_time = time.time()
-            if current_time - self.last_request_time < self.request_delay:
-                time.sleep(self.request_delay)
-
-            response = self.session.get(url, timeout=10)
-            self.last_request_time = time.time()
-
-            if response.status_code != 200:
-                return None
-
-            soup = BeautifulSoup(response.text, 'lxml')
-            metadata = self.get_metadata(soup)
-
-            # Extract main content (simplified)
-            content = ' '.join([p.text for p in soup.find_all('p')])
-            processed = self.processor.process_content(content)
-
-            return {
-                'url': url,
-                'title': metadata.get('title', url),
-                'summary': processed['summary'],
-                'published_date': metadata.get('published_date', '')
-            }
-
-        except Exception as e:
-            print(f"Error processing URL {url}: {e}")
-            return None
-
-    def search(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
-        """Perform search and process results"""
-        try:
-            # Perform DuckDuckGo search
-            search_results = ddg(query, max_results=max_results)
-
-            results = []
-            for result in search_results:
-                processed = self.process_url(result['link'])
-                if processed:
-                    results.append(processed)
-
-            return results[:max_results]
-
-        except Exception as e:
-            print(f"Error in search: {e}")
-            return []
-
-    async def advanced_search(self, query: str, search_type: str = "web", **kwargs) -> Dict[str, Any]:
-        """Perform advanced search based on type"""
-        results = {}
-
-        try:
-            if search_type == "web":
-                results["web"] = self.search(query, kwargs.get("max_results", 5))
-            elif search_type == "username":
-                results["osint"] = await self.osint_engine.search_username(query)
-            elif search_type == "image":
-                results["image"] = await self.osint_engine.search_image(query)
-            elif search_type == "social":
-                results["social"] = await self.osint_engine.search_social_media(
-                    query,
-                    kwargs.get("platform")
-                )
-            elif search_type == "personal":
-                results["personal"] = await self.osint_engine.gather_personal_info(kwargs)
-            elif search_type == "historical":
-                results["historical"] = await self.osint_engine.search_historical_data(query)
-
-        except Exception as e:
-            results["error"] = str(e)
-
-        return results
-
-# Main search function
-def search(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
-    """Main search function"""
-    engine = WebSearchEngine()
-    return engine.search(query, max_results)
-
-# Main advanced search function
-async def advanced_search(query: str, search_type: str = "web", **kwargs) -> Dict[str, Any]:
-    """Main advanced search function"""
-    engine = WebSearchEngine()
-    return await engine.advanced_search(query, search_type, **kwargs)

space.yml
DELETED
@@ -1,11 +0,0 @@
-title: Intelligent Search Engine
-emoji: 🔍
-colorFrom: blue
-colorTo: indigo
-sdk: gradio
-sdk_version: 4.14.0
-python_version: "3.10"
-app_file: app.py
-app_port: 7860
-pinned: false
-license: apache-2.0