rkihacker committed
Commit 64f616b · verified · parent: 4906187

Update main.py

Files changed (1): main.py (+49 -68)
main.py CHANGED
@@ -29,13 +29,9 @@ else:
 # --- Constants & Headers ---
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "gpt-4.1-mini"
+MAX_SOURCES_TO_PROCESS = 15  # Increase research depth for longer reports
 
-# Automatic Context Sizing
-TARGET_TOKEN_LIMIT = 28000
-ESTIMATED_CHARS_PER_TOKEN = 4
-MAX_CONTEXT_CHAR_LENGTH = TARGET_TOKEN_LIMIT * ESTIMATED_CHARS_PER_TOKEN
-
-# Real Browser User Agents for Rotation (Used for both search and scraping)
+# Real Browser User Agents for Rotation
 USER_AGENTS = [
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
@@ -56,71 +52,40 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
 
 app = FastAPI(
     title="AI Deep Research API",
-    description="Provides robust, streaming deep research completions using DuckDuckGo Search.",
-    version="5.0.0" # Final Production Version with new Search Provider
+    description="Provides robust, long-form, streaming deep research completions.",
+    version="6.0.0" # Final Production Version
 )
 
 # --- Core Service Functions ---
-
-# ***** THE NEW SEARCH FUNCTION *****
 async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
-    """Performs a search using DuckDuckGo's HTML interface and parses the results."""
     search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
-    logger.info(f"Searching DuckDuckGo for: '{query}'")
     headers = {'User-Agent': random.choice(USER_AGENTS)}
-
     try:
         async with session.get(search_url, headers=headers, timeout=15) as response:
-            response.raise_for_status()
-            html = await response.text()
-            soup = BeautifulSoup(html, "html.parser")
-
-            results = []
-            # Find all result containers, which have a class 'result'
-            for result_container in soup.find_all('div', class_='result'):
-                title_tag = result_container.find('a', class_='result__a')
-                snippet_tag = result_container.find('a', class_='result__snippet')
-
-                if title_tag and snippet_tag and title_tag.has_attr('href'):
-                    # The link in DDG's HTML version is a redirect, so we need to clean it
-                    raw_link = title_tag['href']
-                    cleaned_link = re.sub(r'/l/\?kh=-1&uddg=', '', raw_link)
-
-                    results.append({
-                        'title': title_tag.get_text(strip=True),
-                        'link': cleaned_link,
-                        'snippet': snippet_tag.get_text(strip=True)
-                    })
-
+            response.raise_for_status(); html = await response.text()
+            soup = BeautifulSoup(html, "html.parser"); results = []
+            for res in soup.find_all('div', class_='result'):
+                title_tag, snippet_tag = res.find('a', class_='result__a'), res.find('a', class_='result__snippet')
+                if title_tag and snippet_tag and 'href' in title_tag.attrs:
+                    cleaned_link = re.sub(r'/l/\?kh=-1&uddg=', '', title_tag['href'])
+                    results.append({'title': title_tag.text, 'link': cleaned_link, 'snippet': snippet_tag.text})
             logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
             return results
     except Exception as e:
         logger.error(f"DuckDuckGo search failed for query '{query}': {e}"); return []
 
-
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
-    """Scrapes a single source and falls back to its snippet if scraping fails."""
-    logger.info(f"Processing source: {source['link']}")
     headers = {'User-Agent': random.choice(USER_AGENTS)}
-
     try:
-        if source['link'].lower().endswith('.pdf'):
-            raise ValueError("PDF content cannot be scraped.")
-
+        if source['link'].lower().endswith('.pdf'): raise ValueError("PDF content")
         async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
-            if response.status != 200:
-                raise ValueError(f"HTTP status {response.status}")
-
+            if response.status != 200: raise ValueError(f"HTTP status {response.status}")
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
             for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']): tag.decompose()
             content = " ".join(soup.stripped_strings)
+            if not content.strip(): raise ValueError("Parsed content is empty.")
             return content, source
-
     except Exception as e:
         logger.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
         return source.get('snippet', ''), source
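
A caveat on the link cleaning in this hunk: DuckDuckGo's HTML endpoint returns redirect hrefs whose real target is carried percent-encoded in the `uddg` query parameter (the `/l/?kh=-1&uddg=` prefix matched by the regex above), so stripping a fixed prefix can leave the target URL percent-encoded and drag along any trailing parameters. A more defensive decoder might look like this sketch (the helper name is hypothetical, not part of this commit):

    from urllib.parse import parse_qs, urlparse

    def clean_ddg_link(raw_link: str) -> str:
        """Best-effort decode of a DuckDuckGo redirect href into its real target URL."""
        parsed = urlparse(raw_link)
        if parsed.path.startswith('/l/'):
            # parse_qs percent-decodes values, so no extra unquote step is needed
            target = parse_qs(parsed.query).get('uddg')
            if target:
                return target[0]
        return raw_link  # direct links pass through unchanged
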
@@ -143,40 +108,49 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
 
         yield format_sse({"event": "plan", "data": sub_questions})
 
-        # Step 2: Conduct Research in Parallel using DuckDuckGo
+        # Step 2: Conduct Deep Research
         yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
         search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
         all_search_results = await asyncio.gather(*search_tasks)
 
-        unique_sources = list({source['link']: source for results in all_search_results for source in results if 'link' in source and 'snippet' in source}.values())
+        unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
 
         if not unique_sources:
-            yield format_sse({"event": "error", "data": "All search queries returned zero usable sources from DuckDuckGo."}); return
+            yield format_sse({"event": "error", "data": "All search queries returned zero usable sources."}); return
 
-        yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing..."})
+        # Limit the number of sources to process for very long reports
+        sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
+        yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})
 
-        processing_tasks = [research_and_process_source(session, source) for source in unique_sources]
+        processing_tasks = [research_and_process_source(session, source) for source in sources_to_process]
         consolidated_context, all_sources_used = "", []
-        successful_scrapes = 0
 
         for task in asyncio.as_completed(processing_tasks):
            content, source_info = await task
            if content:
                consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
                all_sources_used.append(source_info)
-                if not content == source_info.get('snippet'): successful_scrapes += 1
-
-        logger.info(f"Context complete. Scraped {successful_scrapes}/{len(unique_sources)} pages. Used {len(all_sources_used)} total sources (with snippet fallbacks).")
 
         if not consolidated_context.strip():
-            yield format_sse({"event": "error", "data": "Failed to gather any research context from scraping or snippets."}); return
+            yield format_sse({"event": "error", "data": "Failed to gather any research context."}); return
 
-        # Step 3: Synthesize Final Report
+        # Step 3: Synthesize Long-Form Final Report
         yield format_sse({"event": "status", "data": "Synthesizing final report..."})
-        if len(consolidated_context) > MAX_CONTEXT_CHAR_LENGTH:
-            consolidated_context = consolidated_context[:MAX_CONTEXT_CHAR_LENGTH]
-
-        report_prompt = f'Synthesize the provided context into a comprehensive, well-structured report on "{query}". Use markdown. Context:\n{consolidated_context}'
+
+        # ***** ENHANCED PROMPT FOR LONGEST POSSIBLE REPORT *****
+        report_prompt = f"""
+You are an expert research analyst. Your task is to synthesize the provided context into a long-form, comprehensive, multi-page report on the topic: "{query}".
+
+Follow these instructions carefully:
+1. Write in a professional, academic tone.
+2. Structure the report with a clear introduction, multiple detailed sections with sub-headings using Markdown, and a concluding summary.
+3. Elaborate extensively on each point. Use multiple paragraphs for each section to explore the nuances of the topic.
+4. Base your entire report *only* on the information provided in the context below. Do not use any external knowledge.
+5. Aim for the most detailed and thorough report possible based on the given material.
+
+## Research Context ##
+{consolidated_context}
+"""
         report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": report_prompt}], "stream": True}
 
         async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
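
For readers of this diff in isolation: `format_sse` and `LLM_HEADERS` are defined earlier in main.py and are unchanged by this commit. Judging by how the events are consumed, `format_sse` presumably frames a dict as a Server-Sent Events message, along these lines (an assumed shape, not shown in this diff):

    import json

    def format_sse(payload: dict) -> str:
        # Serialize the event dict as a single SSE 'data:' frame
        return f"data: {json.dumps(payload)}\n\n"
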
@@ -185,12 +159,19 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                 line_str = line.decode('utf-8').strip()
                 if line_str.startswith('data:'): line_str = line_str[5:].strip()
                 if line_str == "[DONE]": break
+
+                # ***** FIX FOR 'list index out of range' ERROR *****
                 try:
                     chunk = json.loads(line_str)
-                    content = chunk.get("choices", [{}])[0].get("delta", {}).get("content")
-                    if content: yield format_sse({"event": "chunk", "data": content})
-                except json.JSONDecodeError: continue
-
+                    choices = chunk.get("choices")
+                    if choices and isinstance(choices, list) and len(choices) > 0:
+                        content = choices[0].get("delta", {}).get("content")
+                        if content:
+                            yield format_sse({"event": "chunk", "data": content})
+                except json.JSONDecodeError:
+                    continue  # Ignore malformed lines
+
+        # Final event with all source data
         yield format_sse({"event": "sources", "data": all_sources_used})
     except Exception as e:
         logger.error(f"A critical error occurred: {e}", exc_info=True)
 