Spaces:

rkihacker
/

Scrap

Paused

App Files Files Community

rkihacker committed on Sep 15

Commit

4906187

verified ·

1 Parent(s): bc2abd9

Update main.py

Browse files

Files changed (1) hide show

main.py +63 -54

main.py CHANGED Viewed

@@ -5,6 +5,7 @@ import logging
 import random
 import re
 from typing import AsyncGenerator, Optional, Tuple, List
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
@@ -26,34 +27,15 @@ else:
     logger.info("LLM API Key loaded successfully.")
 # --- Constants & Headers ---
-# API Provider Constants
-SNAPZION_API_URL = "https://search.snapzion.com/get-snippets"
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "gpt-4.1-mini"
-# Automatic Context Sizing (No more fixed limits)
 TARGET_TOKEN_LIMIT = 28000
 ESTIMATED_CHARS_PER_TOKEN = 4
 MAX_CONTEXT_CHAR_LENGTH = TARGET_TOKEN_LIMIT * ESTIMATED_CHARS_PER_TOKEN
-# ***** THE CRITICAL FIX: Full, legitimate headers for the Snapzion API call *****
-SNAPZION_HEADERS = {
-    'accept': '*/*',
-    'accept-language': 'en-US,en;q=0.9',
-    'content-type': 'application/json',
-    'origin': 'https://search.snapzion.com',
-    'priority': 'u=1, i',
-    'referer': 'https://search.snapzion.com/docs',
-    'sec-ch-ua': '"Chromium";v="140", "Not=A?Brand";v="24", "Google Chrome";v="140"',
-    'sec-ch-ua-mobile': '?0',
-    'sec-ch-ua-platform': '"Windows"',
-    'sec-fetch-dest': 'empty',
-    'sec-fetch-mode': 'cors',
-    'sec-fetch-site': 'same-origin',
-    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
-}
-# Real Browser User Agents for SCRAPING ROTATION
 USER_AGENTS = [
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
@@ -74,46 +56,74 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
 app = FastAPI(
     title="AI Deep Research API",
-    description="Provides robust, streaming deep research completions.",
-    version="4.0.0"  # Final Production Version
 )
 # --- Core Service Functions ---
async def call_snapzion_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
    """POST *query* to the Snapzion snippet endpoint and return its organic results.

    Args:
        session: Shared aiohttp client session used to issue the request.
        query: Search text sent as the ``query`` field of the JSON payload.

    Returns:
        The ``organic_results`` value of the JSON response (an empty list when
        the key is absent), or an empty list if anything raises.
    """
    logger.info(f"Searching Snapzion for: '{query}'")
    payload = {"query": query}
    try:
        async with session.post(
            SNAPZION_API_URL,
            headers=SNAPZION_HEADERS,
            json=payload,
            timeout=20,
        ) as resp:
            # Non-2xx answers are turned into exceptions and handled below.
            resp.raise_for_status()
            body = await resp.json()
            hits = body.get("organic_results", [])
            logger.info(f"Found {len(hits)} sources for: '{query}'")
            return hits
    except Exception as exc:
        # Best-effort search: log the failure and report "no results".
        logger.error(f"Snapzion search failed for query '{query}': {exc}")
        return []
async def scrape_url(session: aiohttp.ClientSession, url: str) -> str:
    """Fetch *url* and return its raw HTML, or an ``"Error: ..."`` marker string.

    Args:
        session: Shared aiohttp client session used to issue the GET request.
        url: Address of the page to download.

    Returns:
        The response body as text on HTTP 200. Otherwise a string starting
        with ``"Error:"`` describing why no HTML is available (PDF link,
        non-200 status, or the exception that was raised).
    """
    is_pdf = url.lower().endswith('.pdf')
    if is_pdf:
        return "Error: PDF"
    try:
        # A random User-Agent is chosen for every request.
        request_headers = {'User-Agent': random.choice(USER_AGENTS)}
        async with session.get(url, headers=request_headers, timeout=10, ssl=False) as resp:
            if resp.status == 200:
                # Full HTML is returned; parsing happens in the caller.
                return await resp.text()
            return f"Error: HTTP {resp.status}"
    except Exception as exc:
        return f"Error: {exc}"
def parse_html(html: str) -> str:
    """Reduce an HTML document to its visible text, joined by single spaces.

    ``script``, ``style``, ``nav``, ``footer``, ``header`` and ``aside``
    elements are removed before the remaining stripped strings are joined.
    """
    document = BeautifulSoup(html, "html.parser")
    unwanted = document.find_all(['script', 'style', 'nav', 'footer', 'header', 'aside'])
    for element in unwanted:
        element.decompose()
    text_fragments = document.stripped_strings
    return " ".join(text_fragments)
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
-    html_or_error = await scrape_url(session, source['link'])
-    if html_or_error.startswith("Error:"):
-        logger.warning(f"Scraping failed for {source['link']} ({html_or_error}). Falling back to snippet.")
-        return source.get('snippet', ''), source
-    content = parse_html(html_or_error)
-    return content, source
 # --- Streaming Deep Research Logic ---
 async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
@@ -133,20 +143,19 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             yield format_sse({"event": "plan", "data": sub_questions})
-            # Step 2: Conduct Research in Parallel
             yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
-            search_tasks = [call_snapzion_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
             unique_sources = list({source['link']: source for results in all_search_results for source in results if 'link' in source and 'snippet' in source}.values())
             if not unique_sources:
-                yield format_sse({"event": "error", "data": "All search queries returned zero usable sources. The search provider might be blocking requests or the topic is too obscure."}); return
             yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing..."})
             processing_tasks = [research_and_process_source(session, source) for source in unique_sources]
             consolidated_context, all_sources_used = "", []
             successful_scrapes = 0

 import random
 import re
 from typing import AsyncGenerator, Optional, Tuple, List
+from urllib.parse import quote_plus
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
     logger.info("LLM API Key loaded successfully.")
 # --- Constants & Headers ---
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "gpt-4.1-mini"
+# Automatic Context Sizing
 TARGET_TOKEN_LIMIT = 28000
 ESTIMATED_CHARS_PER_TOKEN = 4
 MAX_CONTEXT_CHAR_LENGTH = TARGET_TOKEN_LIMIT * ESTIMATED_CHARS_PER_TOKEN
+# Real Browser User Agents for Rotation (Used for both search and scraping)
 USER_AGENTS = [
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
 app = FastAPI(
     title="AI Deep Research API",
+    description="Provides robust, streaming deep research completions using DuckDuckGo Search.",
+    version="5.0.0"  # Final Production Version with new Search Provider
 )
 # --- Core Service Functions ---
+# ***** THE NEW SEARCH FUNCTION *****
def _resolve_ddg_redirect(href: str) -> str:
    """Return the destination URL wrapped inside a DuckDuckGo redirect link.

    Result links on the HTML endpoint are redirects that carry the real,
    percent-encoded target in a ``uddg`` query parameter (for example
    ``/l/?kh=-1&uddg=https%3A%2F%2Fexample.com``; scheme-relative variants
    such as ``//duckduckgo.com/l/?uddg=...`` are presumably also served —
    verify against live responses). Any href that is not such a redirect is
    returned unchanged, apart from giving scheme-relative links an ``https:``
    scheme.
    """
    from urllib.parse import parse_qs, urlparse

    # Scheme-relative hrefs are not fetchable as-is; give them a scheme.
    if href.startswith('//'):
        href = f"https:{href}"
    try:
        parsed = urlparse(href)
    except ValueError:
        # Malformed href: hand it back untouched rather than dropping the result.
        return href
    on_ddg = parsed.netloc == '' or parsed.netloc.endswith('duckduckgo.com')
    if on_ddg and parsed.path.rstrip('/') == '/l':
        # parse_qs percent-decodes the value, yielding a directly usable URL.
        targets = parse_qs(parsed.query).get('uddg')
        if targets:
            return targets[0]
    return href


async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
    """Search DuckDuckGo's HTML interface and return the parsed results.

    Args:
        session: Shared aiohttp client session used to issue the GET request.
        query: Search text; it is URL-encoded into the ``q`` parameter.

    Returns:
        A list of dicts with ``'title'``, ``'link'`` and ``'snippet'`` keys,
        one per result container that has both a title link and a snippet.
        ``'link'`` is the decoded destination URL, not the DuckDuckGo
        redirect. An empty list is returned if the request or parsing raises.
    """
    search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
    logger.info(f"Searching DuckDuckGo for: '{query}'")
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    try:
        async with session.get(search_url, headers=headers, timeout=15) as response:
            response.raise_for_status()
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            results = []
            # Each hit lives in a <div class="result"> container.
            for result_container in soup.find_all('div', class_='result'):
                title_tag = result_container.find('a', class_='result__a')
                snippet_tag = result_container.find('a', class_='result__snippet')
                if title_tag and snippet_tag and title_tag.has_attr('href'):
                    results.append({
                        'title': title_tag.get_text(strip=True),
                        # The href is a redirect; unwrap and decode the real target
                        # so it can be fetched and de-duplicated by URL downstream.
                        'link': _resolve_ddg_redirect(title_tag['href']),
                        'snippet': snippet_tag.get_text(strip=True)
                    })
            logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
            return results
    except Exception as e:
        # Best-effort search: log the failure and report "no results".
        logger.error(f"DuckDuckGo search failed for query '{query}': {e}")
        return []
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
+    """Scrapes a single source and falls back to its snippet if scraping fails."""
+    logger.info(f"Processing source: {source['link']}")
+    headers = {'User-Agent': random.choice(USER_AGENTS)}
+    try:
+        if source['link'].lower().endswith('.pdf'):
+            raise ValueError("PDF content cannot be scraped.")
+        async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
+            if response.status != 200:
+                raise ValueError(f"HTTP status {response.status}")
+            html = await response.text()
+            soup = BeautifulSoup(html, "html.parser")
+            for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']): tag.decompose()
+            content = " ".join(soup.stripped_strings)
+            if not content.strip(): # Check if parsed content is empty
+                raise ValueError("Parsed content is empty.")
+            return content, source
+    except Exception as e:
+        logger.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
+        return source.get('snippet', ''), source
 # --- Streaming Deep Research Logic ---
 async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             yield format_sse({"event": "plan", "data": sub_questions})
+            # Step 2: Conduct Research in Parallel using DuckDuckGo
             yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
+            search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
             unique_sources = list({source['link']: source for results in all_search_results for source in results if 'link' in source and 'snippet' in source}.values())
             if not unique_sources:
+                yield format_sse({"event": "error", "data": "All search queries returned zero usable sources from DuckDuckGo."}); return
             yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing..."})
             processing_tasks = [research_and_process_source(session, source) for source in unique_sources]
             consolidated_context, all_sources_used = "", []
             successful_scrapes = 0