Spaces:

rkihacker
/

Scrap

Paused

App Files Community

rkihacker committed on Sep 17

Commit

a38a28a

verified ·

1 Parent(s): 5366574

Update main.py

Browse files

Files changed (1) hide show

main.py +82 -35

main.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import asyncio
 import json
@@ -51,14 +52,22 @@ app = FastAPI(
 )
 # Enable CORS for all origins
-app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
 # --- Helper Functions ---
 def extract_json_from_llm_response(text: str) -> Optional[list]:
     match = re.search(r'\[.*\]', text, re.DOTALL)
     if match:
-        try: return json.loads(match.group(0))
-        except json.JSONDecodeError: return None
     return None
 # --- Core Service Functions ---
@@ -66,10 +75,9 @@ async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str, max
     """Performs a search using the DDGS API with an existing aiohttp session."""
     logger.info(f"Searching DuckDuckGo API for: '{query}'")
     try:
-        # Initialize DDGS with the provided session, no 'async with' needed here
         ddgs = DDGS(session=session)
         raw_results = [r async for r in ddgs.atext(query, max_results=max_results)]
         results = [
             {'title': r.get('title'), 'link': r.get('href'), 'snippet': r.get('body')}
             for r in raw_results if r.get('href') and r.get('title') and r.get('body')
@@ -77,7 +85,6 @@ async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str, max
         logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
         return results
     except Exception as e:
-        # Log the full traceback for detailed debugging
         logger.error(f"DuckDuckGo search failed for query '{query}': {e}", exc_info=True)
         return []
@@ -85,52 +92,83 @@ async def research_and_process_source(session: aiohttp.ClientSession, source: di
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
         logger.info(f"Scraping: {source['link']}")
-        if source['link'].lower().endswith('.pdf'): raise ValueError("PDF content")
         async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
-            if response.status != 200: raise ValueError(f"HTTP status {response.status}")
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
-            for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']): tag.decompose()
             content = " ".join(soup.stripped_strings)
-            if not content.strip(): raise ValueError("Parsed content is empty.")
             return content, source
     except Exception as e:
         logger.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
         return source.get('snippet', ''), source
 # --- Streaming Deep Research Logic ---
 async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
-    def format_sse(data: dict) -> str: return f"data: {json.dumps(data)}\n\n"
     try:
         # Create a single session for all HTTP requests in this stream
         async with aiohttp.ClientSession() as session:
             yield format_sse({"event": "status", "data": "Generating research plan..."})
-            plan_prompt = {"model": LLM_MODEL, "messages": [{"role": "user", "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array. Example: [\"Question 1?\"]"}]}
             try:
                 async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=plan_prompt, timeout=25) as response:
-                    response.raise_for_status(); result = await response.json()
                     sub_questions = result if isinstance(result, list) else extract_json_from_llm_response(result['choices'][0]['message']['content'])
-                    if not isinstance(sub_questions, list): raise ValueError(f"Invalid plan from LLM: {result}")
             except Exception as e:
-                yield format_sse({"event": "error", "data": f"Could not generate research plan. Reason: {e}"}); return
             yield format_sse({"event": "plan", "data": sub_questions})
             yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
             # Pass the single session to each search task
             search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
             unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
             if not unique_sources:
-                yield format_sse({"event": "error", "data": "All search queries returned zero usable sources."}); return
             sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
-            yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})
             processing_tasks = [research_and_process_source(session, source) for source in sources_to_process]
-            consolidated_context, all_sources_used = "", []
             for task in asyncio.as_completed(processing_tasks):
                 content, source_info = await task
                 if content:
@@ -138,34 +176,43 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                     all_sources_used.append(source_info)
             if not consolidated_context.strip():
-                yield format_sse({"event": "error", "data": "Failed to gather any research context."}); return
             yield format_sse({"event": "status", "data": "Synthesizing final report..."})
             report_prompt = f'Synthesize the provided context into a long-form, comprehensive, multi-page report on "{query}". Use markdown. Elaborate extensively on each point. Base your entire report ONLY on the provided context.\n\n## Research Context ##\n{consolidated_context}'
-            report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": report_prompt}], "stream": True}
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                 response.raise_for_status()
                 async for line in response.content:
                     line_str = line.decode('utf-8').strip()
-                    if line_str.startswith('data:'): line_str = line_str[5:].strip()
-                    if line_str == "[DONE]": break
                     try:
                         chunk = json.loads(line_str)
                         choices = chunk.get("choices")
                         if choices and isinstance(choices, list) and len(choices) > 0:
                             content = choices[0].get("delta", {}).get("content")
                             if content:
                                 yield format_sse({"event": "chunk", "data": content})
-                    except json.JSONDecodeError: continue
             yield format_sse({"event": "sources", "data": all_sources_used})
     except Exception as e:
         logger.error(f"A critical error occurred: {e}", exc_info=True)
-        yield format_sse({"event": "error", "data": str(e)})
-    finally:
-        yield format_sse({"event": "done", "data": "Deep research complete."})
-@app.post("/v1/deepresearch/completions")
-async def deep_research_endpoint(request: DeepResearchRequest):
-    return StreamingResponse(run_deep_research_stream(request.query), media_type="text/event-stream")

 import os
 import asyncio
 import json
 )
 # Enable CORS for all origins
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"]
+)
 # --- Helper Functions ---
 def extract_json_from_llm_response(text: str) -> Optional[list]:
     match = re.search(r'\[.*\]', text, re.DOTALL)
     if match:
+        try:
+            return json.loads(match.group(0))
+        except json.JSONDecodeError:
+            return None
     return None
 # --- Core Service Functions ---
     """Performs a search using the DDGS API with an existing aiohttp session."""
     logger.info(f"Searching DuckDuckGo API for: '{query}'")
     try:
         ddgs = DDGS(session=session)
         raw_results = [r async for r in ddgs.atext(query, max_results=max_results)]
         results = [
             {'title': r.get('title'), 'link': r.get('href'), 'snippet': r.get('body')}
             for r in raw_results if r.get('href') and r.get('title') and r.get('body')
         logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
         return results
     except Exception as e:
         logger.error(f"DuckDuckGo search failed for query '{query}': {e}", exc_info=True)
         return []
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
         logger.info(f"Scraping: {source['link']}")
+        if source['link'].lower().endswith('.pdf'):
+            raise ValueError("PDF content")
         async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
+            if response.status != 200:
+                raise ValueError(f"HTTP status {response.status}")
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
+            # Remove unnecessary tags
+            for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
+                tag.decompose()
             content = " ".join(soup.stripped_strings)
+            if not content.strip():
+                raise ValueError("Parsed content is empty.")
             return content, source
     except Exception as e:
         logger.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
         return source.get('snippet', ''), source
 # --- Streaming Deep Research Logic ---
 async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
+    def format_sse(data: dict) -> str:
+        return f"data: {json.dumps(data)}\n\n"
     try:
         # Create a single session for all HTTP requests in this stream
         async with aiohttp.ClientSession() as session:
             yield format_sse({"event": "status", "data": "Generating research plan..."})
+            plan_prompt = {
+                "model": LLM_MODEL,
+                "messages": [{
+                    "role": "user",
+                    "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array. Example: [\"Question 1?\"]"
+                }]
+            }
             try:
                 async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=plan_prompt, timeout=25) as response:
+                    response.raise_for_status()
+                    result = await response.json()
                     sub_questions = result if isinstance(result, list) else extract_json_from_llm_response(result['choices'][0]['message']['content'])
+                    if not isinstance(sub_questions, list):
+                        raise ValueError(f"Invalid plan from LLM: {result}")
             except Exception as e:
+                yield format_sse({"event": "error", "data": f"Could not generate research plan. Reason: {e}"})
+                return
             yield format_sse({"event": "plan", "data": sub_questions})
             yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
             # Pass the single session to each search task
             search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
+            # Flatten and deduplicate sources by link
             unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
             if not unique_sources:
+                yield format_sse({"event": "error", "data": "All search queries returned zero usable sources."})
+                return
             sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
+            yield format_sse({
+                "event": "status",
+                "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."
+            })
             processing_tasks = [research_and_process_source(session, source) for source in sources_to_process]
+            consolidated_context = ""
+            all_sources_used = []
             for task in asyncio.as_completed(processing_tasks):
                 content, source_info = await task
                 if content:
                     all_sources_used.append(source_info)
             if not consolidated_context.strip():
+                yield format_sse({"event": "error", "data": "Failed to gather any research context."})
+                return
             yield format_sse({"event": "status", "data": "Synthesizing final report..."})
             report_prompt = f'Synthesize the provided context into a long-form, comprehensive, multi-page report on "{query}". Use markdown. Elaborate extensively on each point. Base your entire report ONLY on the provided context.\n\n## Research Context ##\n{consolidated_context}'
+            report_payload = {
+                "model": LLM_MODEL,
+                "messages": [{"role": "user", "content": report_prompt}],
+                "stream": True
+            }
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                 response.raise_for_status()
                 async for line in response.content:
                     line_str = line.decode('utf-8').strip()
+                    if line_str.startswith('data:'):
+                        line_str = line_str[5:].strip()
+                    if line_str == "[DONE]":
+                        break
                     try:
                         chunk = json.loads(line_str)
                         choices = chunk.get("choices")
                         if choices and isinstance(choices, list) and len(choices) > 0:
                             content = choices[0].get("delta", {}).get("content")
                             if content:
                                 yield format_sse({"event": "chunk", "data": content})
+                    except json.JSONDecodeError:
+                        continue
             yield format_sse({"event": "sources", "data": all_sources_used})
     except Exception as e:
         logger.error(f"A critical error occurred: {e}", exc_info=True)
+        yield format_sse({"event": "error", "data": str(e)})