rkihacker committed
Commit 0eacd1e · verified · 1 Parent(s): 43aeff7

Update main.py

Files changed (1)
  1. main.py +27 -49
main.py CHANGED
@@ -5,7 +5,6 @@ import logging
 import random
 import re
 from typing import AsyncGenerator, Optional, Tuple, List
-from urllib.parse import unquote
 
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
@@ -14,6 +13,7 @@ from pydantic import BaseModel
 from dotenv import load_dotenv
 import aiohttp
 from bs4 import BeautifulSoup
+from ddgs import DDGS  # <-- Make sure this import is present
 
 # --- Configuration ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -47,7 +47,7 @@ class DeepResearchRequest(BaseModel):
 app = FastAPI(
     title="AI Deep Research API",
     description="Provides robust, long-form, streaming deep research completions using the DuckDuckGo Search API.",
-    version="9.3.0"  # Using direct DuckDuckGo HTML API
+    version="9.4.0"  # Reverted to reliable DDGS library search
 )
 
 # Enable CORS for all origins
@@ -65,52 +65,29 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
 
 # --- Core Service Functions ---
 async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str, max_results: int = 10) -> List[dict]:
-    """Performs a search by directly scraping the DuckDuckGo HTML interface."""
-    logger.info(f"Searching DuckDuckGo for: '{query}'")
-    search_url = "https://html.duckduckgo.com/html/"
-    params = {"q": query}
-    headers = {"User-Agent": random.choice(USER_AGENTS)}
-
+    """
+    Performs a search using the DDGS library with an existing aiohttp session.
+    This method is more reliable than direct HTML scraping.
+    """
+    logger.info(f"Searching DuckDuckGo API via DDGS for: '{query}'")
     try:
-        async with session.post(search_url, data=params, headers=headers, ssl=False) as response:
-            if response.status != 200:
-                logger.error(f"DuckDuckGo search failed with status {response.status} for query '{query}'")
-                return []
-
-            html = await response.text()
-            soup = BeautifulSoup(html, "html.parser")
-            results = []
-
-            for result in soup.find_all('div', class_='result'):
-                title_elem = result.find('a', class_='result__a')
-                snippet_elem = result.find('a', class_='result__snippet')
-                link_elem = result.find('a', class_='result__url')
-
-                if title_elem and snippet_elem and link_elem:
-                    # Extract the raw href which is a redirect
-                    raw_href = link_elem.get('href', '')
-
-                    # The actual URL is in a query parameter 'uddg'
-                    parsed_url_match = re.search(r'uddg=([^&]+)', raw_href)
-                    if parsed_url_match:
-                        # URL decode the extracted URL
-                        link = unquote(parsed_url_match.group(1))
-                    else:
-                        continue  # Skip if we can't find the clean URL
-
-                    title = title_elem.get_text(strip=True)
-                    snippet = snippet_elem.get_text(strip=True)
-
-                    results.append({'title': title, 'link': link, 'snippet': snippet})
-                    if len(results) >= max_results:
-                        break
-
-        logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
-        return results
+        ddgs = DDGS(session=session)
+        # Use ddgs.atext for asynchronous text search
+        raw_results = [r async for r in ddgs.atext(query, max_results=max_results)]
+
+        # Filter and format results to ensure they have the necessary keys
+        results = [
+            {'title': r.get('title'), 'link': r.get('href'), 'snippet': r.get('body')}
+            for r in raw_results if r.get('href') and r.get('title') and r.get('body')
+        ]
+
+        logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
+        return results
     except Exception as e:
-        logger.error(f"DuckDuckGo search failed for query '{query}': {e}", exc_info=True)
+        logger.error(f"DDGS search failed for query '{query}': {e}", exc_info=True)
         return []
 
+
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
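Aside from the library swap, the rewritten helper's main job is normalizing DDGS result keys (title/href/body) into the title/link/snippet shape the rest of the pipeline expects. For illustration, the same filter-and-rename comprehension run on hand-written sample data instead of live results:

# Illustration only: mirrors the comprehension in the hunk above, with sample data.
raw_results = [
    {"title": "Example", "href": "https://example.com", "body": "An example snippet."},
    {"title": "No link", "body": "Dropped because 'href' is missing."},
]

results = [
    {"title": r.get("title"), "link": r.get("href"), "snippet": r.get("body")}
    for r in raw_results
    if r.get("href") and r.get("title") and r.get("body")
]

print(results)
# [{'title': 'Example', 'link': 'https://example.com', 'snippet': 'An example snippet.'}]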
@@ -152,8 +129,8 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             response.raise_for_status()
             result = await response.json()
             sub_questions = result if isinstance(result, list) else extract_json_from_llm_response(result['choices'][0]['message']['content'])
-            if not isinstance(sub_questions, list):
-                raise ValueError(f"Invalid plan from LLM: {result}")
+            if not isinstance(sub_questions, list) or not sub_questions:
+                raise ValueError(f"Invalid or empty plan from LLM: {result}")
     except Exception as e:
         yield format_sse({"event": "error", "data": f"Could not generate research plan. Reason: {e}"})
         return
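The tightened plan validation relies on extract_json_from_llm_response, whose body sits outside the hunks. A minimal sketch of what such a helper could look like — an assumption, not the commit's actual implementation — pulls the first JSON array out of free-form LLM output:

import json
import re
from typing import Optional

def extract_json_array_sketch(text: str) -> Optional[list]:
    # Hypothetical stand-in for extract_json_from_llm_response: grab the
    # outermost [...] span and try to parse it as JSON.
    match = re.search(r"\[.*\]", text, re.DOTALL)
    if not match:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None

print(extract_json_array_sketch('Sure! Here is the plan:\n["q1", "q2", "q3"]'))
# ['q1', 'q2', 'q3']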
@@ -166,7 +143,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
     unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
 
     if not unique_sources:
-        yield format_sse({"event": "error", "data": "All search queries returned zero usable sources."})
+        yield format_sse({"event": "error", "data": f"Could not find any relevant sources for the query '{query}'. Please try a different topic."})
         return
 
     sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
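The unique_sources expression just above this hunk's error path dedupes by URL with a dict comprehension: dicts preserve insertion order and later duplicates overwrite earlier ones, so each link survives exactly once. Isolated with sample data:

all_search_results = [
    [{"link": "https://a.com", "title": "A"}, {"link": "https://b.com", "title": "B"}],
    [{"link": "https://a.com", "title": "A again"}],  # duplicate URL from another sub-query
]

unique_sources = list({source["link"]: source for results in all_search_results for source in results}.values())
print([s["title"] for s in unique_sources])
# ['A again', 'B'] -- one entry per link; the last occurrence wins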
@@ -177,12 +154,12 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
 
     for task in asyncio.as_completed(processing_tasks):
         content, source_info = await task
-        if content:
+        if content and content.strip():
             consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
             all_sources_used.append(source_info)
 
     if not consolidated_context.strip():
-        yield format_sse({"event": "error", "data": "Failed to gather any research context."})
+        yield format_sse({"event": "error", "data": "Failed to scrape content from any of the discovered sources."})
         return
 
     yield format_sse({"event": "status", "data": "Synthesizing final report..."})
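format_sse is also defined outside the hunks. Assuming standard Server-Sent Events framing — a guess at the helper's shape, not the commit's code — a minimal version would be:

import json

def format_sse_sketch(payload: dict) -> str:
    # SSE messages are 'data: <payload>' terminated by a blank line.
    return f"data: {json.dumps(payload)}\n\n"

print(repr(format_sse_sketch({"event": "status", "data": "Synthesizing final report..."})))
# 'data: {"event": "status", "data": "Synthesizing final report..."}\n\n'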
@@ -222,4 +199,5 @@ async def deep_research_endpoint(request: DeepResearchRequest):
 
 if __name__ == "__main__":
     import uvicorn
+    # To run this app: uvicorn your_filename:app --reload
     uvicorn.run(app, host="0.0.0.0", port=8000)
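With the server running on port 8000, the stream can be read with any SSE-capable client. A hypothetical consumer follows; the /deep-research route path and the {"query": ...} body are assumptions, since deep_research_endpoint's decorator and DeepResearchRequest's fields fall outside the shown hunks:

import asyncio
import aiohttp

async def consume_stream() -> None:
    async with aiohttp.ClientSession() as session:
        # Route and payload are assumed for illustration; check the actual
        # @app.post(...) decorator and DeepResearchRequest model in main.py.
        async with session.post(
            "http://localhost:8000/deep-research",
            json={"query": "history of solar power"},
        ) as response:
            async for raw_line in response.content:
                line = raw_line.decode("utf-8").strip()
                if line.startswith("data: "):
                    print(line[len("data: "):])

asyncio.run(consume_stream())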
 