rkihacker committed
Commit bc2abd9 · verified · 1 Parent(s): 46a015f

Update main.py

Files changed (1)
  1. main.py +65 -57
main.py CHANGED
@@ -31,74 +31,89 @@ SNAPZION_API_URL = "https://search.snapzion.com/get-snippets"
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "gpt-4.1-mini"

-# Automatic Context Sizing based on Tokens
-TARGET_TOKEN_LIMIT = 28000 # Safe limit for models with ~32k context windows
+# Automatic Context Sizing (No more fixed limits)
+TARGET_TOKEN_LIMIT = 28000
 ESTIMATED_CHARS_PER_TOKEN = 4
 MAX_CONTEXT_CHAR_LENGTH = TARGET_TOKEN_LIMIT * ESTIMATED_CHARS_PER_TOKEN

-# Real Browser User Agents for Rotation
+# ***** THE CRITICAL FIX: Full, legitimate headers for the Snapzion API call *****
+SNAPZION_HEADERS = {
+    'accept': '*/*',
+    'accept-language': 'en-US,en;q=0.9',
+    'content-type': 'application/json',
+    'origin': 'https://search.snapzion.com',
+    'priority': 'u=1, i',
+    'referer': 'https://search.snapzion.com/docs',
+    'sec-ch-ua': '"Chromium";v="140", "Not=A?Brand";v="24", "Google Chrome";v="140"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"Windows"',
+    'sec-fetch-dest': 'empty',
+    'sec-fetch-mode': 'cors',
+    'sec-fetch-site': 'same-origin',
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
+}
+
+# Real Browser User Agents for SCRAPING ROTATION
 USER_AGENTS = [
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0",
-    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1"
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0"
 ]

-# Headers
-SNAPZION_HEADERS = {'Content-Type': 'application/json'}
 LLM_HEADERS = {"Authorization": f"Bearer {LLM_API_KEY}", "Content-Type": "application/json", "Accept": "application/json"}

-# --- Pydantic Models & Helper Functions ---
 class DeepResearchRequest(BaseModel):
     query: str

 def extract_json_from_llm_response(text: str) -> Optional[list]:
     match = re.search(r'\[.*\]', text, re.DOTALL)
     if match:
-        json_str = match.group(0)
-        try: return json.loads(json_str)
+        try: return json.loads(match.group(0))
         except json.JSONDecodeError: return None
     return None

-# --- FastAPI App ---
 app = FastAPI(
     title="AI Deep Research API",
     description="Provides robust, streaming deep research completions.",
-    version="3.0.0" # Major version bump for robustness overhaul
+    version="4.0.0" # Final Production Version
 )

 # --- Core Service Functions ---
 async def call_snapzion_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
+    logger.info(f"Searching Snapzion for: '{query}'")
     try:
-        async with session.post(SNAPZION_API_URL, headers=SNAPZION_HEADERS, json={"query": query}, timeout=15) as response:
-            response.raise_for_status(); data = await response.json()
-            return data.get("organic_results", [])
+        async with session.post(SNAPZION_API_URL, headers=SNAPZION_HEADERS, json={"query": query}, timeout=20) as response:
+            response.raise_for_status()
+            data = await response.json()
+            results = data.get("organic_results", [])
+            logger.info(f"Found {len(results)} sources for: '{query}'")
+            return results
     except Exception as e:
         logger.error(f"Snapzion search failed for query '{query}': {e}"); return []

 async def scrape_url(session: aiohttp.ClientSession, url: str) -> str:
-    if url.lower().endswith('.pdf'): return "Error: PDF content cannot be scraped."
+    if url.lower().endswith('.pdf'): return "Error: PDF"
     try:
-        # Rotate user agents for each request
         headers = {'User-Agent': random.choice(USER_AGENTS)}
         async with session.get(url, headers=headers, timeout=10, ssl=False) as response:
-            if response.status != 200: return f"Error: HTTP status {response.status}"
-            html = await response.text()
-            soup = BeautifulSoup(html, "html.parser")
-            for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']): tag.decompose()
-            return " ".join(soup.stripped_strings)
+            if response.status != 200: return f"Error: HTTP {response.status}"
+            return await response.text() # Return full HTML for parsing
     except Exception as e:
-        logger.warning(f"Scraping failed for {url}: {e}"); return f"Error: {e}"
+        return f"Error: {e}"
+
+def parse_html(html: str) -> str:
+    soup = BeautifulSoup(html, "html.parser")
+    for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']): tag.decompose()
+    return " ".join(soup.stripped_strings)

 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
-    """Scrapes a single source and falls back to its snippet if scraping fails."""
-    scraped_content = await scrape_url(session, source['link'])
-    if scraped_content.startswith("Error:"):
-        # SNIPPET FALLBACK LOGIC
-        logger.warning(f"Scraping failed for {source['link']}. Falling back to snippet.")
-        return source['snippet'], source
-    return scraped_content, source
+    html_or_error = await scrape_url(session, source['link'])
+    if html_or_error.startswith("Error:"):
+        logger.warning(f"Scraping failed for {source['link']} ({html_or_error}). Falling back to snippet.")
+        return source.get('snippet', ''), source
+
+    content = parse_html(html_or_error)
+    return content, source

 # --- Streaming Deep Research Logic ---
 async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
@@ -107,36 +122,32 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
         async with aiohttp.ClientSession() as session:
             # Step 1: Generate Research Plan
             yield format_sse({"event": "status", "data": "Generating research plan..."})
-            plan_prompt = {"model": LLM_MODEL, "messages": [{"role": "user", "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array, without markdown. Example: [\"Question 1?\"]"}]}
+            plan_prompt = {"model": LLM_MODEL, "messages": [{"role": "user", "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array. Example: [\"Question 1?\"]"}]}
             try:
-                async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=plan_prompt, timeout=20) as response:
+                async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=plan_prompt, timeout=25) as response:
                     response.raise_for_status(); result = await response.json()
                     sub_questions = result if isinstance(result, list) else extract_json_from_llm_response(result['choices'][0]['message']['content'])
-                    if not isinstance(sub_questions, list): raise ValueError(f"Could not extract a valid list from LLM response: {result}")
+                    if not isinstance(sub_questions, list): raise ValueError(f"Invalid plan from LLM: {result}")
             except Exception as e:
-                logger.error(f"Failed to generate research plan: {e}")
                 yield format_sse({"event": "error", "data": f"Could not generate research plan. Reason: {e}"}); return

             yield format_sse({"event": "plan", "data": sub_questions})

             # Step 2: Conduct Research in Parallel
-            yield format_sse({"event": "status", "data": f"Searching for sources for {len(sub_questions)} topics..."})
+            yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
             search_tasks = [call_snapzion_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)

-            # Deduplicate sources by link to avoid scraping the same page multiple times
-            unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
+            unique_sources = list({source['link']: source for results in all_search_results for source in results if 'link' in source and 'snippet' in source}.values())

             if not unique_sources:
-                yield format_sse({"event": "error", "data": "Search did not return any usable sources."}); return
+                yield format_sse({"event": "error", "data": "All search queries returned zero usable sources. The search provider might be blocking requests or the topic is too obscure."}); return

-            yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Scraping and processing..."})
+            yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing..."})

-            # Process all unique sources concurrently with snippet fallback
             processing_tasks = [research_and_process_source(session, source) for source in unique_sources]

-            consolidated_context = ""
-            all_sources_used = []
+            consolidated_context, all_sources_used = "", []
             successful_scrapes = 0

             for task in asyncio.as_completed(processing_tasks):
@@ -144,10 +155,9 @@
                 if content:
                     consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
                     all_sources_used.append(source_info)
-                    if not content == source_info['snippet']: # Count as success only if not a snippet
-                        successful_scrapes += 1
+                    if not content == source_info.get('snippet'): successful_scrapes += 1

-            logger.info(f"Context gathering complete. Successfully scraped {successful_scrapes}/{len(unique_sources)} pages. Used {len(all_sources_used)} total sources (including snippets).")
+            logger.info(f"Context complete. Scraped {successful_scrapes}/{len(unique_sources)} pages. Used {len(all_sources_used)} total sources (with snippet fallbacks).")

             if not consolidated_context.strip():
                 yield format_sse({"event": "error", "data": "Failed to gather any research context from scraping or snippets."}); return
@@ -155,7 +165,6 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             # Step 3: Synthesize Final Report
             yield format_sse({"event": "status", "data": "Synthesizing final report..."})
             if len(consolidated_context) > MAX_CONTEXT_CHAR_LENGTH:
-                logger.warning(f"Context truncated from {len(consolidated_context)} to {MAX_CONTEXT_CHAR_LENGTH} chars.")
                 consolidated_context = consolidated_context[:MAX_CONTEXT_CHAR_LENGTH]

             report_prompt = f'Synthesize the provided context into a comprehensive, well-structured report on "{query}". Use markdown. Context:\n{consolidated_context}'
@@ -164,19 +173,18 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                 response.raise_for_status()
                 async for line in response.content:
-                    if line.strip():
-                        line_str = line.decode('utf-8').strip()
-                        if line_str.startswith('data:'): line_str = line_str[5:].strip()
-                        if line_str == "[DONE]": break
-                        try:
-                            chunk = json.loads(line_str)
-                            content = chunk.get("choices", [{}])[0].get("delta", {}).get("content")
-                            if content: yield format_sse({"event": "chunk", "data": content})
-                        except json.JSONDecodeError: continue
+                    line_str = line.decode('utf-8').strip()
+                    if line_str.startswith('data:'): line_str = line_str[5:].strip()
+                    if line_str == "[DONE]": break
+                    try:
+                        chunk = json.loads(line_str)
+                        content = chunk.get("choices", [{}])[0].get("delta", {}).get("content")
+                        if content: yield format_sse({"event": "chunk", "data": content})
+                    except json.JSONDecodeError: continue

             yield format_sse({"event": "sources", "data": all_sources_used})
     except Exception as e:
-        logger.error(f"A critical error occurred in the main research stream: {e}", exc_info=True)
+        logger.error(f"A critical error occurred: {e}", exc_info=True)
         yield format_sse({"event": "error", "data": str(e)})
     finally:
         yield format_sse({"event": "done", "data": "Deep research complete."})
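
For reference, a minimal client sketch for consuming the stream that run_deep_research_stream produces. This is illustrative only: it assumes the generator is exposed as a POST endpoint accepting a DeepResearchRequest body ({"query": ...}) and that format_sse emits standard "data: <json>" SSE lines carrying the event/data objects seen above; the route path and port below are hypothetical and not part of this commit.

# Hypothetical client for the streaming endpoint; API_URL is an assumption, not defined in this diff.
import asyncio
import json

import aiohttp

API_URL = "http://localhost:8000/deep-research"  # assumed route; adjust to the real one

async def consume_report(query: str) -> None:
    async with aiohttp.ClientSession() as session:
        async with session.post(API_URL, json={"query": query}) as response:
            response.raise_for_status()
            async for raw_line in response.content:
                line = raw_line.decode("utf-8").strip()
                if not line.startswith("data:"):
                    continue  # skip blank keep-alive lines between SSE messages
                event = json.loads(line[5:].strip())
                kind, data = event.get("event"), event.get("data")
                if kind == "chunk":
                    print(data, end="", flush=True)  # report text streams incrementally
                else:
                    print(f"\n[{kind}] {data}")      # status / plan / sources / error / done
                if kind == "done":
                    break

if __name__ == "__main__":
    asyncio.run(consume_report("example research topic"))

The event names handled here (status, plan, chunk, sources, error, done) match the ones yielded by run_deep_research_stream in the diff above.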