Spaces:

rkihacker
/

Scrap

Paused

App Files Community

rkihacker committed on Sep 17

Commit

43aeff7

verified ·

1 Parent(s): a38a28a

Update main.py

Browse files

Files changed (1) hide show

main.py +61 -54

main.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import os
 import asyncio
 import json
@@ -6,6 +5,7 @@ import logging
 import random
 import re
 from typing import AsyncGenerator, Optional, Tuple, List
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
@@ -14,7 +14,6 @@ from pydantic import BaseModel
 from dotenv import load_dotenv
 import aiohttp
 from bs4 import BeautifulSoup
-from ddgs import DDGS
 # --- Configuration ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -48,17 +47,11 @@ class DeepResearchRequest(BaseModel):
 app = FastAPI(
     title="AI Deep Research API",
     description="Provides robust, long-form, streaming deep research completions using the DuckDuckGo Search API.",
-    version="9.2.0"  # Robust async client handling
 )
 # Enable CORS for all origins
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"]
-)
 # --- Helper Functions ---
 def extract_json_from_llm_response(text: str) -> Optional[list]:
@@ -72,18 +65,48 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
 # --- Core Service Functions ---
 async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str, max_results: int = 10) -> List[dict]:
-    """Performs a search using the DDGS API with an existing aiohttp session."""
-    logger.info(f"Searching DuckDuckGo API for: '{query}'")
     try:
-        ddgs = DDGS(session=session)
-        raw_results = [r async for r in ddgs.atext(query, max_results=max_results)]
-        results = [
-            {'title': r.get('title'), 'link': r.get('href'), 'snippet': r.get('body')}
-            for r in raw_results if r.get('href') and r.get('title') and r.get('body')
-        ]
-        logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
-        return results
     except Exception as e:
         logger.error(f"DuckDuckGo search failed for query '{query}': {e}", exc_info=True)
         return []
@@ -94,24 +117,17 @@ async def research_and_process_source(session: aiohttp.ClientSession, source: di
         logger.info(f"Scraping: {source['link']}")
         if source['link'].lower().endswith('.pdf'):
             raise ValueError("PDF content")
         async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
             if response.status != 200:
                 raise ValueError(f"HTTP status {response.status}")
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
-            # Remove unnecessary tags
             for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                 tag.decompose()
             content = " ".join(soup.stripped_strings)
             if not content.strip():
                 raise ValueError("Parsed content is empty.")
             return content, source
     except Exception as e:
         logger.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
         return source.get('snippet', ''), source
@@ -122,10 +138,8 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
         return f"data: {json.dumps(data)}\n\n"
     try:
-        # Create a single session for all HTTP requests in this stream
         async with aiohttp.ClientSession() as session:
             yield format_sse({"event": "status", "data": "Generating research plan..."})
             plan_prompt = {
                 "model": LLM_MODEL,
                 "messages": [{
@@ -133,7 +147,6 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                     "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array. Example: [\"Question 1?\"]"
                 }]
             }
             try:
                 async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=plan_prompt, timeout=25) as response:
                     response.raise_for_status()
@@ -146,13 +159,10 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                 return
             yield format_sse({"event": "plan", "data": sub_questions})
-            yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
-            # Pass the single session to each search task
             search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
-            # Flatten and deduplicate sources by link
             unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
             if not unique_sources:
@@ -160,14 +170,10 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                 return
             sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
-            yield format_sse({
-                "event": "status",
-                "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."
-            })
             processing_tasks = [research_and_process_source(session, source) for source in sources_to_process]
-            consolidated_context = ""
-            all_sources_used = []
             for task in asyncio.as_completed(processing_tasks):
                 content, source_info = await task
@@ -180,30 +186,20 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                 return
             yield format_sse({"event": "status", "data": "Synthesizing final report..."})
             report_prompt = f'Synthesize the provided context into a long-form, comprehensive, multi-page report on "{query}". Use markdown. Elaborate extensively on each point. Base your entire report ONLY on the provided context.\n\n## Research Context ##\n{consolidated_context}'
-            report_payload = {
-                "model": LLM_MODEL,
-                "messages": [{"role": "user", "content": report_prompt}],
-                "stream": True
-            }
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                 response.raise_for_status()
                 async for line in response.content:
                     line_str = line.decode('utf-8').strip()
                     if line_str.startswith('data:'):
                         line_str = line_str[5:].strip()
                     if line_str == "[DONE]":
                         break
                     try:
                         chunk = json.loads(line_str)
                         choices = chunk.get("choices")
                         if choices and isinstance(choices, list) and len(choices) > 0:
                             content = choices[0].get("delta", {}).get("content")
                             if content:
@@ -212,7 +208,18 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                         continue
             yield format_sse({"event": "sources", "data": all_sources_used})
     except Exception as e:
         logger.error(f"A critical error occurred: {e}", exc_info=True)
-        yield format_sse({"event": "error", "data": str(e)})

 import os
 import asyncio
 import json
 import random
 import re
 from typing import AsyncGenerator, Optional, Tuple, List
+from urllib.parse import unquote
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
 from dotenv import load_dotenv
 import aiohttp
 from bs4 import BeautifulSoup
 # --- Configuration ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 app = FastAPI(
     title="AI Deep Research API",
     description="Provides robust, long-form, streaming deep research completions using the DuckDuckGo Search API.",
+    version="9.3.0"  # Using direct DuckDuckGo HTML API
 )
 # Enable CORS for all origins
+app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
 # --- Helper Functions ---
 def extract_json_from_llm_response(text: str) -> Optional[list]:
 # --- Core Service Functions ---
 async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str, max_results: int = 10) -> List[dict]:
+    """Performs a search by directly scraping the DuckDuckGo HTML interface."""
+    logger.info(f"Searching DuckDuckGo for: '{query}'")
+    search_url = "https://html.duckduckgo.com/html/"
+    params = {"q": query}
+    headers = {"User-Agent": random.choice(USER_AGENTS)}
     try:
+        async with session.post(search_url, data=params, headers=headers, ssl=False) as response:
+            if response.status != 200:
+                logger.error(f"DuckDuckGo search failed with status {response.status} for query '{query}'")
+                return []
+            html = await response.text()
+            soup = BeautifulSoup(html, "html.parser")
+            results = []
+            for result in soup.find_all('div', class_='result'):
+                title_elem = result.find('a', class_='result__a')
+                snippet_elem = result.find('a', class_='result__snippet')
+                link_elem = result.find('a', class_='result__url')
+                if title_elem and snippet_elem and link_elem:
+                    # Extract the raw href which is a redirect
+                    raw_href = link_elem.get('href', '')
+                    # The actual URL is in a query parameter 'uddg'
+                    parsed_url_match = re.search(r'uddg=([^&]+)', raw_href)
+                    if parsed_url_match:
+                        # URL decode the extracted URL
+                        link = unquote(parsed_url_match.group(1))
+                    else:
+                        continue # Skip if we can't find the clean URL
+                    title = title_elem.get_text(strip=True)
+                    snippet = snippet_elem.get_text(strip=True)
+                    results.append({'title': title, 'link': link, 'snippet': snippet})
+                    if len(results) >= max_results:
+                        break
+            logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
+            return results
     except Exception as e:
         logger.error(f"DuckDuckGo search failed for query '{query}': {e}", exc_info=True)
         return []
         logger.info(f"Scraping: {source['link']}")
         if source['link'].lower().endswith('.pdf'):
             raise ValueError("PDF content")
         async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
             if response.status != 200:
                 raise ValueError(f"HTTP status {response.status}")
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
             for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                 tag.decompose()
             content = " ".join(soup.stripped_strings)
             if not content.strip():
                 raise ValueError("Parsed content is empty.")
             return content, source
     except Exception as e:
         logger.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
         return source.get('snippet', ''), source
         return f"data: {json.dumps(data)}\n\n"
     try:
         async with aiohttp.ClientSession() as session:
             yield format_sse({"event": "status", "data": "Generating research plan..."})
             plan_prompt = {
                 "model": LLM_MODEL,
                 "messages": [{
                     "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array. Example: [\"Question 1?\"]"
                 }]
             }
             try:
                 async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=plan_prompt, timeout=25) as response:
                     response.raise_for_status()
                 return
             yield format_sse({"event": "plan", "data": sub_questions})
+            yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
             search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
             unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
             if not unique_sources:
                 return
             sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
+            yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})
             processing_tasks = [research_and_process_source(session, source) for source in sources_to_process]
+            consolidated_context, all_sources_used = "", []
             for task in asyncio.as_completed(processing_tasks):
                 content, source_info = await task
                 return
             yield format_sse({"event": "status", "data": "Synthesizing final report..."})
             report_prompt = f'Synthesize the provided context into a long-form, comprehensive, multi-page report on "{query}". Use markdown. Elaborate extensively on each point. Base your entire report ONLY on the provided context.\n\n## Research Context ##\n{consolidated_context}'
+            report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": report_prompt}], "stream": True}
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                 response.raise_for_status()
                 async for line in response.content:
                     line_str = line.decode('utf-8').strip()
                     if line_str.startswith('data:'):
                         line_str = line_str[5:].strip()
                     if line_str == "[DONE]":
                         break
                     try:
                         chunk = json.loads(line_str)
                         choices = chunk.get("choices")
                         if choices and isinstance(choices, list) and len(choices) > 0:
                             content = choices[0].get("delta", {}).get("content")
                             if content:
                         continue
             yield format_sse({"event": "sources", "data": all_sources_used})
     except Exception as e:
         logger.error(f"A critical error occurred: {e}", exc_info=True)
+        yield format_sse({"event": "error", "data": f"An unexpected error occurred: {str(e)}"})
+@app.post("/deep-research", response_class=StreamingResponse)
+async def deep_research_endpoint(request: DeepResearchRequest):
+    """
+    Accepts a query and streams back a detailed research report.
+    Events: status, plan, chunk, sources, error
+    """
+    return StreamingResponse(run_deep_research_stream(request.query), media_type="text/event-stream")
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)