Spaces:

rkihacker
/

Scrap

Paused

App Files Files Community

rkihacker commited on Sep 17

Commit

9c44d7d

verified ·

1 Parent(s): 768d891

Update main.py

Browse files

Files changed (1) hide show

main.py +25 -48

main.py CHANGED Viewed

@@ -5,7 +5,6 @@ import logging
 import random
 import re
 from typing import AsyncGenerator, Optional, Tuple, List
-from urllib.parse import unquote
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
@@ -14,6 +13,7 @@ from pydantic import BaseModel
 from dotenv import load_dotenv
 import aiohttp
 from bs4 import BeautifulSoup
 # --- Configuration ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -46,8 +46,8 @@ class DeepResearchRequest(BaseModel):
 app = FastAPI(
     title="AI Deep Research API",
-    description="Provides robust, long-form, streaming deep research completions using direct DuckDuckGo scraping.",
-    version="9.5.0"  # Implemented direct HTML scraping
 )
 # Enable CORS for all origins
@@ -64,54 +64,30 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
     return None
 # --- Core Service Functions ---
-async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str, max_results: int = 10) -> List[dict]:
     """
-    Performs a search by directly scraping the DuckDuckGo HTML interface,
-    mimicking a real browser request.
     """
-    logger.info(f"Searching DuckDuckGo (HTML) for: '{query}'")
-    search_url = "https://html.duckduckgo.com/html/"
-    # Form data to be sent with the POST request
-    payload = {'q': query, 'b': '', 'kl': '', 'df': ''}
-    # Headers to mimic a browser, based on the provided curl command
-    headers = {
-        'Content-Type': 'application/x-www-form-urlencoded',
-        'Origin': 'https://html.duckduckgo.com',
-        'Referer': 'https://html.duckduckgo.com/',
-        'User-Agent': random.choice(USER_AGENTS)
-    }
     try:
-        async with session.post(search_url, data=payload, headers=headers, ssl=False) as response:
-            if response.status != 200:
-                logger.error(f"DuckDuckGo search failed with status {response.status} for query '{query}'")
-                return []
-            html = await response.text()
-            soup = BeautifulSoup(html, "html.parser")
-            results = []
-            # Find all result containers
-            for result_div in soup.find_all('div', class_='result'):
-                title_elem = result_div.find('a', class_='result__a')
-                snippet_elem = result_div.find('a', class_='result__snippet')
-                if title_elem and snippet_elem:
-                    link = title_elem.get('href')
-                    title = title_elem.get_text(strip=True)
-                    snippet = snippet_elem.get_text(strip=True)
-                    if link and title and snippet:
-                        results.append({'title': title, 'link': link, 'snippet': snippet})
-                        if len(results) >= max_results:
-                            break
-            logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
-            return results
     except Exception as e:
-        logger.error(f"DuckDuckGo HTML search failed for query '{query}': {e}", exc_info=True)
         return []
@@ -165,7 +141,8 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             yield format_sse({"event": "plan", "data": sub_questions})
             yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
-            search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
             unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())

 import random
 import re
 from typing import AsyncGenerator, Optional, Tuple, List
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
 from dotenv import load_dotenv
 import aiohttp
 from bs4 import BeautifulSoup
+from ddgs import DDGS # Ensure this library is installed: pip install duckduckgo-search
 # --- Configuration ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 app = FastAPI(
     title="AI Deep Research API",
+    description="Provides robust, long-form, streaming deep research completions using the DuckDuckGo Search API.",
+    version="9.6.0"  # Correctly implemented DDGS library for robust searching
 )
 # Enable CORS for all origins
     return None
 # --- Core Service Functions ---
+async def call_duckduckgo_search(query: str, max_results: int = 10) -> List[dict]:
     """
+    Performs a search using the DDGS library, correctly handling async operations.
+    This is the most reliable method.
     """
+    logger.info(f"Searching DuckDuckGo API via DDGS for: '{query}'")
     try:
+        results = []
+        # Use 'async with' to let the library manage its own session lifecycle
+        async with DDGS() as ddgs:
+            # The ddgs.atext() is an async generator
+            async for r in ddgs.atext(query, max_results=max_results):
+                results.append(r)
+        # The library now returns a dict with 'title', 'href', and 'body'
+        formatted_results = [
+            {'title': r.get('title'), 'link': r.get('href'), 'snippet': r.get('body')}
+            for r in results if r.get('href') and r.get('title') and r.get('body')
+        ]
+        logger.info(f"Found {len(formatted_results)} sources from DuckDuckGo for: '{query}'")
+        return formatted_results
     except Exception as e:
+        logger.error(f"DDGS search failed for query '{query}': {e}", exc_info=True)
         return []
             yield format_sse({"event": "plan", "data": sub_questions})
             yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
+            # Note: We no longer pass the 'session' object to the search function
+            search_tasks = [call_duckduckgo_search(sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
             unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())