rkihacker committed
Commit 0eacd1e · verified · 1 Parent(s): 43aeff7

Update main.py

Files changed (1)
  1. main.py +27 -49
main.py CHANGED
@@ -5,7 +5,6 @@ import logging
 import random
 import re
 from typing import AsyncGenerator, Optional, Tuple, List
-from urllib.parse import unquote
 
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
@@ -14,6 +13,7 @@ from pydantic import BaseModel
 from dotenv import load_dotenv
 import aiohttp
 from bs4 import BeautifulSoup
+from ddgs import DDGS  # <-- Make sure this import is present
 
 # --- Configuration ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -47,7 +47,7 @@ class DeepResearchRequest(BaseModel):
 app = FastAPI(
     title="AI Deep Research API",
     description="Provides robust, long-form, streaming deep research completions using the DuckDuckGo Search API.",
-    version="9.3.0"  # Using direct DuckDuckGo HTML API
+    version="9.4.0"  # Reverted to reliable DDGS library search
 )
 
 # Enable CORS for all origins
@@ -65,52 +65,29 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
 
 # --- Core Service Functions ---
 async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str, max_results: int = 10) -> List[dict]:
-    """Performs a search by directly scraping the DuckDuckGo HTML interface."""
-    logger.info(f"Searching DuckDuckGo for: '{query}'")
-    search_url = "https://html.duckduckgo.com/html/"
-    params = {"q": query}
-    headers = {"User-Agent": random.choice(USER_AGENTS)}
-
+    """
+    Performs a search using the DDGS library with an existing aiohttp session.
+    This method is more reliable than direct HTML scraping.
+    """
+    logger.info(f"Searching DuckDuckGo API via DDGS for: '{query}'")
     try:
-        async with session.post(search_url, data=params, headers=headers, ssl=False) as response:
-            if response.status != 200:
-                logger.error(f"DuckDuckGo search failed with status {response.status} for query '{query}'")
-                return []
-
-            html = await response.text()
-            soup = BeautifulSoup(html, "html.parser")
-            results = []
-
-            for result in soup.find_all('div', class_='result'):
-                title_elem = result.find('a', class_='result__a')
-                snippet_elem = result.find('a', class_='result__snippet')
-                link_elem = result.find('a', class_='result__url')
-
-                if title_elem and snippet_elem and link_elem:
-                    # Extract the raw href which is a redirect
-                    raw_href = link_elem.get('href', '')
-
-                    # The actual URL is in a query parameter 'uddg'
-                    parsed_url_match = re.search(r'uddg=([^&]+)', raw_href)
-                    if parsed_url_match:
-                        # URL decode the extracted URL
-                        link = unquote(parsed_url_match.group(1))
-                    else:
-                        continue  # Skip if we can't find the clean URL
-
-                    title = title_elem.get_text(strip=True)
-                    snippet = snippet_elem.get_text(strip=True)
-
-                    results.append({'title': title, 'link': link, 'snippet': snippet})
-                    if len(results) >= max_results:
-                        break
-
-        logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
-        return results
+        ddgs = DDGS(session=session)
+        # Use ddgs.atext for asynchronous text search
+        raw_results = [r async for r in ddgs.atext(query, max_results=max_results)]
+
+        # Filter and format results to ensure they have the necessary keys
+        results = [
+            {'title': r.get('title'), 'link': r.get('href'), 'snippet': r.get('body')}
+            for r in raw_results if r.get('href') and r.get('title') and r.get('body')
+        ]
+
+        logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
+        return results
     except Exception as e:
-        logger.error(f"DuckDuckGo search failed for query '{query}': {e}", exc_info=True)
+        logger.error(f"DDGS search failed for query '{query}': {e}", exc_info=True)
         return []
 
+
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
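Aside from the library swap, the rewritten helper's main job is normalizing DDGS result keys (title/href/body) into the title/link/snippet shape the rest of the pipeline expects. For illustration, the same filter-and-rename comprehension run on hand-written sample data instead of live results:

# Illustration only: mirrors the comprehension in the hunk above, with sample data.
raw_results = [
    {"title": "Example", "href": "https://example.com", "body": "An example snippet."},
    {"title": "No link", "body": "Dropped because 'href' is missing."},
]

results = [
    {"title": r.get("title"), "link": r.get("href"), "snippet": r.get("body")}
    for r in raw_results
    if r.get("href") and r.get("title") and r.get("body")
]

print(results)
# [{'title': 'Example', 'link': 'https://example.com', 'snippet': 'An example snippet.'}]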
@@ -152,8 +129,8 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             response.raise_for_status()
             result = await response.json()
             sub_questions = result if isinstance(result, list) else extract_json_from_llm_response(result['choices'][0]['message']['content'])
-            if not isinstance(sub_questions, list):
-                raise ValueError(f"Invalid plan from LLM: {result}")
+            if not isinstance(sub_questions, list) or not sub_questions:
+                raise ValueError(f"Invalid or empty plan from LLM: {result}")
     except Exception as e:
         yield format_sse({"event": "error", "data": f"Could not generate research plan. Reason: {e}"})
         return
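The tightened plan validation relies on extract_json_from_llm_response, whose body sits outside the hunks. A minimal sketch of what such a helper could look like — an assumption, not the commit's actual implementation — pulls the first JSON array out of free-form LLM output:

import json
import re
from typing import Optional

def extract_json_array_sketch(text: str) -> Optional[list]:
    # Hypothetical stand-in for extract_json_from_llm_response: grab the
    # outermost [...] span and try to parse it as JSON.
    match = re.search(r"\[.*\]", text, re.DOTALL)
    if not match:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None

print(extract_json_array_sketch('Sure! Here is the plan:\n["q1", "q2", "q3"]'))
# ['q1', 'q2', 'q3']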
@@ -166,7 +143,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
     unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
 
     if not unique_sources:
-        yield format_sse({"event": "error", "data": "All search queries returned zero usable sources."})
+        yield format_sse({"event": "error", "data": f"Could not find any relevant sources for the query '{query}'. Please try a different topic."})
         return
 
     sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
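The unique_sources expression just above this hunk's error path dedupes by URL with a dict comprehension: dicts preserve insertion order and later duplicates overwrite earlier ones, so each link survives exactly once. Isolated with sample data:

all_search_results = [
    [{"link": "https://a.com", "title": "A"}, {"link": "https://b.com", "title": "B"}],
    [{"link": "https://a.com", "title": "A again"}],  # duplicate URL from another sub-query
]

unique_sources = list({source["link"]: source for results in all_search_results for source in results}.values())
print([s["title"] for s in unique_sources])
# ['A again', 'B'] -- one entry per link; the last occurrence wins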
@@ -177,12 +154,12 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
 
     for task in asyncio.as_completed(processing_tasks):
         content, source_info = await task
-        if content:
+        if content and content.strip():
             consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
             all_sources_used.append(source_info)
 
     if not consolidated_context.strip():
-        yield format_sse({"event": "error", "data": "Failed to gather any research context."})
+        yield format_sse({"event": "error", "data": "Failed to scrape content from any of the discovered sources."})
         return
 
     yield format_sse({"event": "status", "data": "Synthesizing final report..."})
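format_sse is also defined outside the hunks. Assuming standard Server-Sent Events framing — a guess at the helper's shape, not the commit's code — a minimal version would be:

import json

def format_sse_sketch(payload: dict) -> str:
    # SSE messages are 'data: <payload>' terminated by a blank line.
    return f"data: {json.dumps(payload)}\n\n"

print(repr(format_sse_sketch({"event": "status", "data": "Synthesizing final report..."})))
# 'data: {"event": "status", "data": "Synthesizing final report..."}\n\n'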
@@ -222,4 +199,5 @@ async def deep_research_endpoint(request: DeepResearchRequest):
 
 if __name__ == "__main__":
     import uvicorn
+    # To run this app: uvicorn your_filename:app --reload
     uvicorn.run(app, host="0.0.0.0", port=8000)
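With the server running on port 8000, the stream can be read with any SSE-capable client. A hypothetical consumer follows; the /deep-research route path and the {"query": ...} body are assumptions, since deep_research_endpoint's decorator and DeepResearchRequest's fields fall outside the shown hunks:

import asyncio
import aiohttp

async def consume_stream() -> None:
    async with aiohttp.ClientSession() as session:
        # Route and payload are assumed for illustration; check the actual
        # @app.post(...) decorator and DeepResearchRequest model in main.py.
        async with session.post(
            "http://localhost:8000/deep-research",
            json={"query": "history of solar power"},
        ) as response:
            async for raw_line in response.content:
                line = raw_line.decode("utf-8").strip()
                if line.startswith("data: "):
                    print(line[len("data: "):])

asyncio.run(consume_stream())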
 