Update main.py
main.py (CHANGED)
Old `main.py` (the `-` side of the diff; some removed lines survive only as fragments):

```diff
@@ -1,8 +1,30 @@
 import os
 import asyncio
 import json
 import logging
-
 
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
@@ -21,47 +43,55 @@ LLM_API_KEY = os.getenv("LLM_API_KEY")
 if not LLM_API_KEY:
     raise RuntimeError("LLM_API_KEY must be set in a .env file.")
 else:
-    logger.info(
 
-#
 SNAPZION_API_URL = "https://search.snapzion.com/get-snippets"
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "gpt-4.1-mini"
 MAX_CONTEXT_CHAR_LENGTH = 120000
 
-# Headers
 SNAPZION_HEADERS = { 'Content-Type': 'application/json', 'User-Agent': 'AI-Deep-Research-Agent/1.0' }
 SCRAPING_HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36' }
 
-#
-LLM_HEADERS = {
-    "Authorization": f"Bearer {LLM_API_KEY}",
-    "Content-Type": "application/json",
-    "Accept": "application/json",  # Explicitly request a JSON response
-    "User-Agent": "AI-Deep-Research-Client/2.3"
-}
-
-# --- Pydantic Models ---
 class DeepResearchRequest(BaseModel):
     query: str
 
 # --- FastAPI App ---
 app = FastAPI(
     title="AI Deep Research API",
     description="Provides streaming deep research completions.",
-    version="2.
 )
 
 # --- Core Service Functions (Unchanged) ---
 async def call_snapzion_search(session: aiohttp.ClientSession, query: str) -> list:
     try:
         async with session.post(SNAPZION_API_URL, headers=SNAPZION_HEADERS, json={"query": query}, timeout=15) as response:
-            response.raise_for_status()
-            data = await response.json()
             return data.get("organic_results", [])
     except Exception as e:
-        logger.error(f"Snapzion search failed for query '{query}': {e}")
-        return []
 
 async def scrape_url(session: aiohttp.ClientSession, url: str) -> str:
     if url.lower().endswith('.pdf'): return "Error: PDF content cannot be scraped."
@@ -70,74 +100,48 @@ async def scrape_url(session: aiohttp.ClientSession, url: str) -> str:
         if response.status != 200: return f"Error: HTTP status {response.status}"
         html = await response.text()
         soup = BeautifulSoup(html, "html.parser")
-        for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
-            tag.decompose()
         return " ".join(soup.stripped_strings)
     except Exception as e:
-        logger.warning(f"Scraping failed for {url}: {e}")
-        return f"Error: {e}"
 
 async def search_and_scrape(session: aiohttp.ClientSession, query: str) -> tuple[str, list]:
-    search_results = await call_snapzion_search(session, query)
-    sources = search_results[:4]
     if not sources: return "", []
     scrape_tasks = [scrape_url(session, source["link"]) for source in sources]
     scraped_contents = await asyncio.gather(*scrape_tasks)
-    context = "\n\n".join(
-        f"Source Details: Title '{sources[i]['title']}', URL '{sources[i]['link']}'\nContent:\n{content}"
-        for i, content in enumerate(scraped_contents) if not content.startswith("Error:")
-    )
     return context, sources
 
 # --- Streaming Deep Research Logic ---
-
 async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
-
-    def format_sse(data: dict) -> str:
-        return f"data: {json.dumps(data)}\n\n"
-
-    raw_response_text_for_debugging = ""  # Variable to hold response text for logging
     try:
         async with aiohttp.ClientSession() as session:
             # Step 1: Generate Sub-Questions
             yield format_sse({"event": "status", "data": "Generating research plan..."})
 
             sub_question_prompt = {
-                "model": LLM_MODEL,
-                "messages": [{ "role": "user", "content": f"You are a research planner. For the topic '{query}', create a JSON array of 3-4 key sub-questions for a research report. Respond ONLY with the JSON array. Example: [\"Question 1?\", \"Question 2?\"]" }]
             }
-
-            # ***** CHANGE 3: The most critical fix. Heavily reinforced error handling. *****
             try:
-                logger.info(f"Sending request to LLM for planning. Model: {LLM_MODEL}, URL: {LLM_API_URL}")
                 async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=sub_question_prompt, timeout=20) as response:
-
-
-
-
-                    raise Exception(f"LLM provider returned non-200 status: {response.status}")
-
-                    if not raw_response_text_for_debugging:
-                        raise Exception("LLM provider returned an empty response body.")
-
-                    result = json.loads(raw_response_text_for_debugging)
-                    llm_content = result.get('choices', [{}])[0].get('message', {}).get('content', '')
 
-
-
-                    raise
-
-                sub_questions = json.loads(llm_content)
-
             except Exception as e:
-
-
-                yield format_sse({"event": "error", "data": f"Could not generate research plan. Reason: {e}"})
-                return
 
             yield format_sse({"event": "plan", "data": sub_questions})
 
-            #
             research_tasks = [search_and_scrape(session, sq) for sq in sub_questions]
             yield format_sse({"event": "status", "data": f"Starting research on {len(sub_questions)} topics..."})
 
@@ -148,22 +152,18 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                 if context: consolidated_context += context + "\n\n---\n\n"
                 if sources: all_sources.extend(sources)
 
-            yield format_sse({"event": "status", "data": "Consolidating research..."})
-            if len(consolidated_context) > MAX_CONTEXT_CHAR_LENGTH:
-                consolidated_context = consolidated_context[:MAX_CONTEXT_CHAR_LENGTH]
-
             if not consolidated_context.strip():
-                yield format_sse({"event": "error", "data": "Failed to gather any research context."})
-                return
 
             yield format_sse({"event": "status", "data": "Generating final report..."})
             final_report_prompt = f'Synthesize the provided context into a comprehensive report on "{query}". Use markdown. Context:\n{consolidated_context}'
             final_report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": final_report_prompt}], "stream": True}
 
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=final_report_payload) as response:
-
-                error_text = await response.text()
-                raise Exception(f"LLM API Error for final report: {response.status}, {error_text}")
                 async for line in response.content:
                     if line.strip():
                         line_str = line.decode('utf-8').strip()
@@ -171,13 +171,12 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                         if line_str == "[DONE]": break
                         try:
                             chunk = json.loads(line_str)
-                            content = chunk.get("choices", [{}])
                             if content: yield format_sse({"event": "chunk", "data": content})
                         except json.JSONDecodeError: continue
 
             unique_sources = list({s['link']: s for s in all_sources}.values())
             yield format_sse({"event": "sources", "data": unique_sources})
-
     except Exception as e:
         logger.error(f"A critical error occurred in the main research stream: {e}")
         yield format_sse({"event": "error", "data": str(e)})
```
New `main.py` (the `+` side of the diff):

Your Python code (`json.loads()`) sees the triple backticks (`` ``` ``) and the word `json` and correctly determines that this is *not* a valid JSON array: it's a string containing a code block.
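A minimal reproduction of the failure (the fenced string below is illustrative of what the model returns):

```python
import json

# What the model actually returned: a Markdown-fenced code block, not bare JSON.
llm_content = '```json\n["Question 1?", "Question 2?"]\n```'

try:
    json.loads(llm_content)
except json.JSONDecodeError as exc:
    print(f"json.loads rejects it: {exc}")  # Expecting value: line 1 column 1 (char 0)
```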
The fix is to make our code smarter by cleaning up this "helpful" formatting *before* attempting to parse it as JSON.

### The Solution

We will implement two key changes in `main.py`:

1. **Smart JSON Extraction:** We'll add a function that uses a regular expression to find and extract the JSON array (`[...]`) from the LLM's response string, reliably ignoring any Markdown fences or other surrounding text (see the sketch right after this list).
2. **Improved Prompting:** We'll make the instruction in the prompt even more explicit, to reduce the chance of the model adding extra formatting in the first place.
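A minimal sketch of the extraction idea (the helper name here is illustrative; the real function, with logging and error handling, appears in the updated file below):

```python
import json
import re

def extract_array(text: str):
    # Greedily grab everything from the first '[' to the last ']' and parse it.
    match = re.search(r'\[.*\]', text, re.DOTALL)
    return json.loads(match.group(0)) if match else None

print(extract_array('```json\n["Question 1?", "Question 2?"]\n```'))
# ['Question 1?', 'Question 2?']
```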
Regarding your request to "allow streaming for llm": the final report generation step **already does this correctly**. The `typegpt.net` API, being OpenAI-compatible, uses the exact streaming format that the code is built to handle. The previous error was simply preventing the process from ever *reaching* the final streaming step; this fix unblocks it.
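For reference, each payload line of an OpenAI-compatible stream carries the next token in `choices[0].delta.content`, which is exactly what the streaming loop in the code parses. A minimal sketch of handling one such line (payload value illustrative):

```python
import json

# One server-sent line from a "stream": true chat completion.
line = 'data: {"choices": [{"delta": {"content": "Hello"}}]}'

payload = line[len("data: "):]   # strip the SSE "data: " prefix
if payload != "[DONE]":          # the stream terminates with "data: [DONE]"
    chunk = json.loads(payload)
    print(chunk["choices"][0]["delta"].get("content"))  # Hello
```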
The `Dockerfile` and `requirements.txt` do not need any changes.

### Updated `main.py`

Replace the entire content of your `main.py` with this revised, more robust version.
```python
import os
import asyncio
import json
import logging
import re
from typing import AsyncGenerator, Optional

from fastapi import FastAPI
from fastapi.responses import StreamingResponse

# ... (unchanged lines omitted by the diff view) ...

if not LLM_API_KEY:
    raise RuntimeError("LLM_API_KEY must be set in a .env file.")
else:
    logger.info("LLM API Key loaded successfully.")

# API Provider Constants
SNAPZION_API_URL = "https://search.snapzion.com/get-snippets"
LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
LLM_MODEL = "gpt-4.1-mini"
MAX_CONTEXT_CHAR_LENGTH = 120000

# Headers
SNAPZION_HEADERS = { 'Content-Type': 'application/json', 'User-Agent': 'AI-Deep-Research-Agent/1.0' }
SCRAPING_HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36' }
LLM_HEADERS = { "Authorization": f"Bearer {LLM_API_KEY}", "Content-Type": "application/json", "Accept": "application/json" }

# --- Pydantic Models & Helper Functions ---
class DeepResearchRequest(BaseModel):
    query: str

# ***** CHANGE 1: The core of the fix. A robust JSON extraction function. *****
def extract_json_from_llm_response(text: str) -> Optional[list]:
    """
    Finds and parses a JSON array within a string, ignoring Markdown fences.
    """
    # Regex to find a span that starts with [ and ends with ]; the greedy
    # match runs to the last closing bracket, so nested arrays are covered.
    match = re.search(r'\[.*\]', text, re.DOTALL)
    if match:
        json_str = match.group(0)
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            logger.error(f"Failed to parse extracted JSON string: {json_str}")
            return None
    logger.warning(f"No JSON array found in LLM response: {text}")
    return None

# --- FastAPI App ---
app = FastAPI(
    title="AI Deep Research API",
    description="Provides streaming deep research completions.",
    version="2.4.0"  # Version bump for Markdown parsing fix
)

# --- Core Service Functions (Unchanged) ---
async def call_snapzion_search(session: aiohttp.ClientSession, query: str) -> list:
    try:
        async with session.post(SNAPZION_API_URL, headers=SNAPZION_HEADERS, json={"query": query}, timeout=15) as response:
            response.raise_for_status()
            data = await response.json()
            return data.get("organic_results", [])
    except Exception as e:
        logger.error(f"Snapzion search failed for query '{query}': {e}")
        return []

async def scrape_url(session: aiohttp.ClientSession, url: str) -> str:
    if url.lower().endswith('.pdf'): return "Error: PDF content cannot be scraped."
    # ... (unchanged lines omitted by the diff view) ...
        if response.status != 200: return f"Error: HTTP status {response.status}"
        html = await response.text()
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            tag.decompose()
        return " ".join(soup.stripped_strings)
    except Exception as e:
        logger.warning(f"Scraping failed for {url}: {e}")
        return f"Error: {e}"

async def search_and_scrape(session: aiohttp.ClientSession, query: str) -> tuple[str, list]:
    search_results = await call_snapzion_search(session, query)
    sources = search_results[:4]
    if not sources: return "", []
    scrape_tasks = [scrape_url(session, source["link"]) for source in sources]
    scraped_contents = await asyncio.gather(*scrape_tasks)
    context = "\n\n".join(
        f"Source: {sources[i]['link']}\nContent: {content}"
        for i, content in enumerate(scraped_contents) if not content.startswith("Error:")
    )
    return context, sources

# --- Streaming Deep Research Logic ---
async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
    def format_sse(data: dict) -> str:
        return f"data: {json.dumps(data)}\n\n"

    try:
        async with aiohttp.ClientSession() as session:
            # Step 1: Generate Sub-Questions
            yield format_sse({"event": "status", "data": "Generating research plan..."})

            # ***** CHANGE 2: Improved, stricter prompt *****
            sub_question_prompt = {
                "model": LLM_MODEL,
                "messages": [{"role": "user", "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array, without markdown, explanations, or any other text. Example: [\"Question 1?\", \"Question 2?\"]"}]
            }
            try:
                async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=sub_question_prompt, timeout=20) as response:
                    response.raise_for_status()
                    raw_response_text = await response.text()
                    result = json.loads(raw_response_text)
                    # choices is a list, so index [0] before reading the message.
                    llm_content = result.get('choices', [{}])[0].get('message', {}).get('content', '')

                sub_questions = extract_json_from_llm_response(llm_content)
                if not sub_questions:
                    raise ValueError(f"Could not extract valid JSON from LLM content: {llm_content}")
            except Exception as e:
                logger.error(f"Failed to generate research plan: {e}")
                yield format_sse({"event": "error", "data": f"Could not generate research plan. Reason: {e}"})
                return

            yield format_sse({"event": "plan", "data": sub_questions})

            # Steps 2, 3, 4 will now execute correctly
            research_tasks = [search_and_scrape(session, sq) for sq in sub_questions]
            yield format_sse({"event": "status", "data": f"Starting research on {len(sub_questions)} topics..."})

            # ... (unchanged lines omitted by the diff view) ...
                if context: consolidated_context += context + "\n\n---\n\n"
                if sources: all_sources.extend(sources)

            if not consolidated_context.strip():
                yield format_sse({"event": "error", "data": "Failed to gather any research context."})
                return

            yield format_sse({"event": "status", "data": "Generating final report..."})
            if len(consolidated_context) > MAX_CONTEXT_CHAR_LENGTH:
                consolidated_context = consolidated_context[:MAX_CONTEXT_CHAR_LENGTH]

            final_report_prompt = f'Synthesize the provided context into a comprehensive report on "{query}". Use markdown. Context:\n{consolidated_context}'
            final_report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": final_report_prompt}], "stream": True}

            async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=final_report_payload) as response:
                response.raise_for_status()
                async for line in response.content:
                    if line.strip():
                        line_str = line.decode('utf-8').strip()
                        # ... (unchanged line omitted by the diff view) ...
                        if line_str == "[DONE]": break
                        try:
                            chunk = json.loads(line_str)
                            # choices is a list here too: index [0] before reading the delta.
                            content = chunk.get("choices", [{}])[0].get("delta", {}).get("content")
                            if content: yield format_sse({"event": "chunk", "data": content})
                        except json.JSONDecodeError: continue

            unique_sources = list({s['link']: s for s in all_sources}.values())
            yield format_sse({"event": "sources", "data": unique_sources})
    except Exception as e:
        logger.error(f"A critical error occurred in the main research stream: {e}")
        yield format_sse({"event": "error", "data": str(e)})
```
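A quick check of the new extraction path, assuming the module imports cleanly as `main` (module name illustrative; `LLM_API_KEY` must be set, since the module reads it at import time):

```python
from main import extract_json_from_llm_response

# A fenced response, a bare response, and a non-JSON response.
print(extract_json_from_llm_response('```json\n["Q1?", "Q2?"]\n```'))  # ['Q1?', 'Q2?']
print(extract_json_from_llm_response('["Q1?", "Q2?"]'))                # ['Q1?', 'Q2?']
print(extract_json_from_llm_response('Sorry, I cannot help.'))         # None
```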