rkihacker commited on
Commit
9c44d7d
·
verified ·
1 Parent(s): 768d891

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +25 -48
main.py CHANGED
@@ -5,7 +5,6 @@ import logging
5
  import random
6
  import re
7
  from typing import AsyncGenerator, Optional, Tuple, List
8
- from urllib.parse import unquote
9
 
10
  from fastapi import FastAPI
11
  from fastapi.responses import StreamingResponse
@@ -14,6 +13,7 @@ from pydantic import BaseModel
14
  from dotenv import load_dotenv
15
  import aiohttp
16
  from bs4 import BeautifulSoup
 
17
 
18
  # --- Configuration ---
19
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -46,8 +46,8 @@ class DeepResearchRequest(BaseModel):
46
 
47
  app = FastAPI(
48
  title="AI Deep Research API",
49
- description="Provides robust, long-form, streaming deep research completions using direct DuckDuckGo scraping.",
50
- version="9.5.0" # Implemented direct HTML scraping
51
  )
52
 
53
  # Enable CORS for all origins
@@ -64,54 +64,30 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
64
  return None
65
 
66
  # --- Core Service Functions ---
67
- async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str, max_results: int = 10) -> List[dict]:
68
  """
69
- Performs a search by directly scraping the DuckDuckGo HTML interface,
70
- mimicking a real browser request.
71
  """
72
- logger.info(f"Searching DuckDuckGo (HTML) for: '{query}'")
73
- search_url = "https://html.duckduckgo.com/html/"
74
-
75
- # Form data to be sent with the POST request
76
- payload = {'q': query, 'b': '', 'kl': '', 'df': ''}
77
-
78
- # Headers to mimic a browser, based on the provided curl command
79
- headers = {
80
- 'Content-Type': 'application/x-www-form-urlencoded',
81
- 'Origin': 'https://html.duckduckgo.com',
82
- 'Referer': 'https://html.duckduckgo.com/',
83
- 'User-Agent': random.choice(USER_AGENTS)
84
- }
85
-
86
  try:
87
- async with session.post(search_url, data=payload, headers=headers, ssl=False) as response:
88
- if response.status != 200:
89
- logger.error(f"DuckDuckGo search failed with status {response.status} for query '{query}'")
90
- return []
91
-
92
- html = await response.text()
93
- soup = BeautifulSoup(html, "html.parser")
94
- results = []
95
-
96
- # Find all result containers
97
- for result_div in soup.find_all('div', class_='result'):
98
- title_elem = result_div.find('a', class_='result__a')
99
- snippet_elem = result_div.find('a', class_='result__snippet')
100
-
101
- if title_elem and snippet_elem:
102
- link = title_elem.get('href')
103
- title = title_elem.get_text(strip=True)
104
- snippet = snippet_elem.get_text(strip=True)
105
-
106
- if link and title and snippet:
107
- results.append({'title': title, 'link': link, 'snippet': snippet})
108
- if len(results) >= max_results:
109
- break
110
-
111
- logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
112
- return results
113
  except Exception as e:
114
- logger.error(f"DuckDuckGo HTML search failed for query '{query}': {e}", exc_info=True)
115
  return []
116
 
117
 
@@ -165,7 +141,8 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
165
  yield format_sse({"event": "plan", "data": sub_questions})
166
 
167
  yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
168
- search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
 
169
  all_search_results = await asyncio.gather(*search_tasks)
170
  unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
171
 
 
5
  import random
6
  import re
7
  from typing import AsyncGenerator, Optional, Tuple, List
 
8
 
9
  from fastapi import FastAPI
10
  from fastapi.responses import StreamingResponse
 
13
  from dotenv import load_dotenv
14
  import aiohttp
15
  from bs4 import BeautifulSoup
16
+ from ddgs import DDGS # Ensure this library is installed: pip install duckduckgo-search
17
 
18
  # --- Configuration ---
19
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
46
 
47
  app = FastAPI(
48
  title="AI Deep Research API",
49
+ description="Provides robust, long-form, streaming deep research completions using the DuckDuckGo Search API.",
50
+ version="9.6.0" # Correctly implemented DDGS library for robust searching
51
  )
52
 
53
  # Enable CORS for all origins
 
64
  return None
65
 
66
  # --- Core Service Functions ---
67
+ async def call_duckduckgo_search(query: str, max_results: int = 10) -> List[dict]:
68
  """
69
+ Performs a search using the DDGS library, correctly handling async operations.
70
+ This is the most reliable method.
71
  """
72
+ logger.info(f"Searching DuckDuckGo API via DDGS for: '{query}'")
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  try:
74
+ results = []
75
+ # Use 'async with' to let the library manage its own session lifecycle
76
+ async with DDGS() as ddgs:
77
+ # The ddgs.atext() is an async generator
78
+ async for r in ddgs.atext(query, max_results=max_results):
79
+ results.append(r)
80
+
81
+ # The library now returns a dict with 'title', 'href', and 'body'
82
+ formatted_results = [
83
+ {'title': r.get('title'), 'link': r.get('href'), 'snippet': r.get('body')}
84
+ for r in results if r.get('href') and r.get('title') and r.get('body')
85
+ ]
86
+
87
+ logger.info(f"Found {len(formatted_results)} sources from DuckDuckGo for: '{query}'")
88
+ return formatted_results
 
 
 
 
 
 
 
 
 
 
 
89
  except Exception as e:
90
+ logger.error(f"DDGS search failed for query '{query}': {e}", exc_info=True)
91
  return []
92
 
93
 
 
141
  yield format_sse({"event": "plan", "data": sub_questions})
142
 
143
  yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
144
+ # Note: We no longer pass the 'session' object to the search function
145
+ search_tasks = [call_duckduckgo_search(sq) for sq in sub_questions]
146
  all_search_results = await asyncio.gather(*search_tasks)
147
  unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
148