rkihacker committed
Commit 768d891 · verified · 1 Parent(s): 0eacd1e

Update main.py

Files changed (1)
  1. main.py +46 -20
main.py CHANGED
@@ -5,6 +5,7 @@ import logging
 import random
 import re
 from typing import AsyncGenerator, Optional, Tuple, List
+from urllib.parse import unquote
 
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
@@ -13,7 +14,6 @@ from pydantic import BaseModel
 from dotenv import load_dotenv
 import aiohttp
 from bs4 import BeautifulSoup
-from ddgs import DDGS # <-- Make sure this import is present
 
 # --- Configuration ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -46,8 +46,8 @@ class DeepResearchRequest(BaseModel):
 
 app = FastAPI(
     title="AI Deep Research API",
-    description="Provides robust, long-form, streaming deep research completions using the DuckDuckGo Search API.",
-    version="9.4.0" # Reverted to reliable DDGS library search
+    description="Provides robust, long-form, streaming deep research completions using direct DuckDuckGo scraping.",
+    version="9.5.0" # Implemented direct HTML scraping
 )
 
 # Enable CORS for all origins
@@ -66,25 +66,52 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
 # --- Core Service Functions ---
 async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str, max_results: int = 10) -> List[dict]:
     """
-    Performs a search using the DDGS library with an existing aiohttp session.
-    This method is more reliable than direct HTML scraping.
+    Performs a search by directly scraping the DuckDuckGo HTML interface,
+    mimicking a real browser request.
     """
-    logger.info(f"Searching DuckDuckGo API via DDGS for: '{query}'")
+    logger.info(f"Searching DuckDuckGo (HTML) for: '{query}'")
+    search_url = "https://html.duckduckgo.com/html/"
+
+    # Form data to be sent with the POST request
+    payload = {'q': query, 'b': '', 'kl': '', 'df': ''}
+
+    # Headers to mimic a browser, based on the provided curl command
+    headers = {
+        'Content-Type': 'application/x-www-form-urlencoded',
+        'Origin': 'https://html.duckduckgo.com',
+        'Referer': 'https://html.duckduckgo.com/',
+        'User-Agent': random.choice(USER_AGENTS)
+    }
+
     try:
-        ddgs = DDGS(session=session)
-        # Use ddgs.atext for asynchronous text search
-        raw_results = [r async for r in ddgs.atext(query, max_results=max_results)]
-
-        # Filter and format results to ensure they have the necessary keys
-        results = [
-            {'title': r.get('title'), 'link': r.get('href'), 'snippet': r.get('body')}
-            for r in raw_results if r.get('href') and r.get('title') and r.get('body')
-        ]
-
-        logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
-        return results
+        async with session.post(search_url, data=payload, headers=headers, ssl=False) as response:
+            if response.status != 200:
+                logger.error(f"DuckDuckGo search failed with status {response.status} for query '{query}'")
+                return []
+
+            html = await response.text()
+            soup = BeautifulSoup(html, "html.parser")
+            results = []
+
+            # Find all result containers
+            for result_div in soup.find_all('div', class_='result'):
+                title_elem = result_div.find('a', class_='result__a')
+                snippet_elem = result_div.find('a', class_='result__snippet')
+
+                if title_elem and snippet_elem:
+                    link = title_elem.get('href')
+                    title = title_elem.get_text(strip=True)
+                    snippet = snippet_elem.get_text(strip=True)
+
+                    if link and title and snippet:
+                        results.append({'title': title, 'link': link, 'snippet': snippet})
+                        if len(results) >= max_results:
+                            break
+
+            logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
+            return results
    except Exception as e:
-        logger.error(f"DDGS search failed for query '{query}': {e}", exc_info=True)
+        logger.error(f"DuckDuckGo HTML search failed for query '{query}': {e}", exc_info=True)
         return []
 
 
@@ -199,5 +226,4 @@ async def deep_research_endpoint(request: DeepResearchRequest):
 
 if __name__ == "__main__":
     import uvicorn
-    # To run this app: uvicorn your_filename:app --reload
     uvicorn.run(app, host="0.0.0.0", port=8000)
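
For a quick sanity check of the new scraper, here is a minimal usage sketch (not part of the commit). The import path main, the helper name resolve_redirect, and the query string are illustrative; the idea that the newly added urllib.parse.unquote is meant for decoding DuckDuckGo's /l/?uddg= redirect links is an assumption inferred from the import, since the shown hunks never call it.

import asyncio
from urllib.parse import unquote

import aiohttp

from main import call_duckduckgo_search  # assumes the file above is saved as main.py


def resolve_redirect(link: str) -> str:
    # DuckDuckGo's HTML frontend typically returns redirect links of the form
    # //duckduckgo.com/l/?uddg=<percent-encoded-url>&rut=...; unquote recovers
    # the target URL. (Assumption: this is what the added import is for.)
    if "uddg=" in link:
        return unquote(link.split("uddg=", 1)[1].split("&", 1)[0])
    return link


async def demo() -> None:
    async with aiohttp.ClientSession() as session:
        results = await call_duckduckgo_search(session, "fastapi streaming responses", max_results=5)
        for r in results:
            print(r["title"], "->", resolve_redirect(r["link"]))


if __name__ == "__main__":
    asyncio.run(demo())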