rkihacker commited on
Commit
ffce11c
·
verified ·
1 Parent(s): 64f616b

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +38 -37
main.py CHANGED
@@ -5,10 +5,11 @@ import logging
5
  import random
6
  import re
7
  from typing import AsyncGenerator, Optional, Tuple, List
8
- from urllib.parse import quote_plus
9
 
10
  from fastapi import FastAPI
11
  from fastapi.responses import StreamingResponse
 
12
  from pydantic import BaseModel
13
  from dotenv import load_dotenv
14
  import aiohttp
@@ -29,7 +30,7 @@ else:
29
  # --- Constants & Headers ---
30
  LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
31
  LLM_MODEL = "gpt-4.1-mini"
32
- MAX_SOURCES_TO_PROCESS = 15 # Increase research depth for longer reports
33
 
34
  # Real Browser User Agents for Rotation
35
  USER_AGENTS = [
@@ -43,6 +44,23 @@ LLM_HEADERS = {"Authorization": f"Bearer {LLM_API_KEY}", "Content-Type": "applic
43
  class DeepResearchRequest(BaseModel):
44
  query: str
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def extract_json_from_llm_response(text: str) -> Optional[list]:
47
  match = re.search(r'\[.*\]', text, re.DOTALL)
48
  if match:
@@ -50,26 +68,29 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
50
  except json.JSONDecodeError: return None
51
  return None
52
 
53
- app = FastAPI(
54
- title="AI Deep Research API",
55
- description="Provides robust, long-form, streaming deep research completions.",
56
- version="6.0.0" # Final Production Version
57
- )
58
-
59
  # --- Core Service Functions ---
60
  async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
61
- search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
62
  headers = {'User-Agent': random.choice(USER_AGENTS)}
63
  try:
64
  async with session.get(search_url, headers=headers, timeout=15) as response:
65
  response.raise_for_status(); html = await response.text()
66
  soup = BeautifulSoup(html, "html.parser"); results = []
67
  for res in soup.find_all('div', class_='result'):
68
- title_tag, snippet_tag = res.find('a', class_='result__a'), res.find('a', class_='result__snippet')
 
69
  if title_tag and snippet_tag and 'href' in title_tag.attrs:
70
- cleaned_link = re.sub(r'/l/\?kh=-1&uddg=', '', title_tag['href'])
71
- results.append({'title': title_tag.text, 'link': cleaned_link, 'snippet': snippet_tag.text})
72
- logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
 
 
 
 
 
 
 
 
73
  return results
74
  except Exception as e:
75
  logger.error(f"DuckDuckGo search failed for query '{query}': {e}"); return []
@@ -77,6 +98,7 @@ async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str) ->
77
  async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
78
  headers = {'User-Agent': random.choice(USER_AGENTS)}
79
  try:
 
80
  if source['link'].lower().endswith('.pdf'): raise ValueError("PDF content")
81
  async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
82
  if response.status != 200: raise ValueError(f"HTTP status {response.status}")
@@ -95,7 +117,6 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
95
  def format_sse(data: dict) -> str: return f"data: {json.dumps(data)}\n\n"
96
  try:
97
  async with aiohttp.ClientSession() as session:
98
- # Step 1: Generate Research Plan
99
  yield format_sse({"event": "status", "data": "Generating research plan..."})
100
  plan_prompt = {"model": LLM_MODEL, "messages": [{"role": "user", "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array. Example: [\"Question 1?\"]"}]}
101
  try:
@@ -108,17 +129,14 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
108
 
109
  yield format_sse({"event": "plan", "data": sub_questions})
110
 
111
- # Step 2: Conduct Deep Research
112
  yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
113
  search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
114
  all_search_results = await asyncio.gather(*search_tasks)
115
-
116
  unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
117
 
118
  if not unique_sources:
119
  yield format_sse({"event": "error", "data": "All search queries returned zero usable sources."}); return
120
 
121
- # Limit the number of sources to process for very long reports
122
  sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
123
  yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})
124
 
@@ -134,23 +152,8 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
134
  if not consolidated_context.strip():
135
  yield format_sse({"event": "error", "data": "Failed to gather any research context."}); return
136
 
137
- # Step 3: Synthesize Long-Form Final Report
138
  yield format_sse({"event": "status", "data": "Synthesizing final report..."})
139
-
140
- # ***** ENHANCED PROMPT FOR LONGEST POSSIBLE REPORT *****
141
- report_prompt = f"""
142
- You are an expert research analyst. Your task is to synthesize the provided context into a long-form, comprehensive, multi-page report on the topic: "{query}".
143
-
144
- Follow these instructions carefully:
145
- 1. Write in a professional, academic tone.
146
- 2. Structure the report with a clear introduction, multiple detailed sections with sub-headings using Markdown, and a concluding summary.
147
- 3. Elaborate extensively on each point. Use multiple paragraphs for each section to explore the nuances of the topic.
148
- 4. Base your entire report *only* on the information provided in the context below. Do not use any external knowledge.
149
- 5. Aim for the most detailed and thorough report possible based on the given material.
150
-
151
- ## Research Context ##
152
- {consolidated_context}
153
- """
154
  report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": report_prompt}], "stream": True}
155
 
156
  async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
@@ -160,7 +163,7 @@ Follow these instructions carefully:
160
  if line_str.startswith('data:'): line_str = line_str[5:].strip()
161
  if line_str == "[DONE]": break
162
 
163
- # ***** FIX FOR 'list index out of range' ERROR *****
164
  try:
165
  chunk = json.loads(line_str)
166
  choices = chunk.get("choices")
@@ -169,9 +172,8 @@ Follow these instructions carefully:
169
  if content:
170
  yield format_sse({"event": "chunk", "data": content})
171
  except json.JSONDecodeError:
172
- continue # Ignore malformed lines
173
 
174
- # Final event with all source data
175
  yield format_sse({"event": "sources", "data": all_sources_used})
176
  except Exception as e:
177
  logger.error(f"A critical error occurred: {e}", exc_info=True)
@@ -179,7 +181,6 @@ Follow these instructions carefully:
179
  finally:
180
  yield format_sse({"event": "done", "data": "Deep research complete."})
181
 
182
- # --- API Endpoints ---
183
  @app.post("/v1/deepresearch/completions")
184
  async def deep_research_endpoint(request: DeepResearchRequest):
185
  return StreamingResponse(run_deep_research_stream(request.query), media_type="text/event-stream")
 
5
  import random
6
  import re
7
  from typing import AsyncGenerator, Optional, Tuple, List
8
+ from urllib.parse import unquote
9
 
10
  from fastapi import FastAPI
11
  from fastapi.responses import StreamingResponse
12
+ from fastapi.middleware.cors import CORSMiddleware
13
  from pydantic import BaseModel
14
  from dotenv import load_dotenv
15
  import aiohttp
 
30
  # --- Constants & Headers ---
31
  LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
32
  LLM_MODEL = "gpt-4.1-mini"
33
+ MAX_SOURCES_TO_PROCESS = 15
34
 
35
  # Real Browser User Agents for Rotation
36
  USER_AGENTS = [
 
44
# Request payload for POST /v1/deepresearch/completions.
class DeepResearchRequest(BaseModel):
    query: str


app = FastAPI(
    title="AI Deep Research API",
    description="Provides robust, long-form, streaming deep research completions.",
    version="7.0.0"  # Final Production Version
)

# Allow any origin to reach the API (browser-based SSE consumers need this).
# NOTE(review): allow_credentials=True together with allow_origins=["*"] is a
# questionable pairing under the CORS spec — confirm credentialed requests are
# actually required here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
logger.info("CORS middleware enabled for all origins.")
62
+
63
+ # --- Helper Functions ---
64
  def extract_json_from_llm_response(text: str) -> Optional[list]:
65
  match = re.search(r'\[.*\]', text, re.DOTALL)
66
  if match:
 
68
  except json.JSONDecodeError: return None
69
  return None
70
 
 
 
 
 
 
 
71
  # --- Core Service Functions ---
72
async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
    """Search DuckDuckGo's HTML endpoint and return parsed result dicts.

    Each result dict carries 'title', 'link' (the decoded target URL, http(s)
    only) and 'snippet'. Best-effort: any network or parse failure is logged
    and yields an empty list instead of raising.
    """
    # Local import: the module top currently imports only `unquote`.
    from urllib.parse import quote_plus

    # quote_plus percent-encodes ALL reserved characters (&, ?, =, #, %),
    # not just spaces — `query.replace(' ', '+')` silently corrupted any
    # query containing them.
    search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
    headers = {'User-Agent': random.choice(USER_AGENTS)}  # rotate UA to reduce blocking
    try:
        async with session.get(search_url, headers=headers, timeout=15) as response:
            response.raise_for_status()
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            results = []
            for res in soup.find_all('div', class_='result'):
                title_tag = res.find('a', class_='result__a')
                snippet_tag = res.find('a', class_='result__snippet')
                if title_tag and snippet_tag and 'href' in title_tag.attrs:
                    try:
                        raw_link = title_tag['href']
                        # DuckDuckGo wraps the real URL in a redirect like
                        # /l/?kh=-1&uddg=<percent-encoded-url>&rut=...; cut off
                        # any trailing parameters BEFORE decoding, otherwise the
                        # '&rut=...' suffix ends up glued onto the target URL.
                        encoded = raw_link.split('uddg=')[1].split('&')[0]
                        actual_url = unquote(encoded)
                        if actual_url.startswith("http"):
                            results.append({'title': title_tag.text, 'link': actual_url, 'snippet': snippet_tag.text})
                    except IndexError:
                        # Link without a 'uddg=' parameter — unexpected format, skip it.
                        continue
            logger.info(f"Found {len(results)} valid sources from DuckDuckGo for: '{query}'")
            return results
    except Exception as e:
        logger.error(f"DuckDuckGo search failed for query '{query}': {e}")
        return []
 
98
  async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
99
  headers = {'User-Agent': random.choice(USER_AGENTS)}
100
  try:
101
+ logger.info(f"Scraping: {source['link']}")
102
  if source['link'].lower().endswith('.pdf'): raise ValueError("PDF content")
103
  async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
104
  if response.status != 200: raise ValueError(f"HTTP status {response.status}")
 
117
  def format_sse(data: dict) -> str: return f"data: {json.dumps(data)}\n\n"
118
  try:
119
  async with aiohttp.ClientSession() as session:
 
120
  yield format_sse({"event": "status", "data": "Generating research plan..."})
121
  plan_prompt = {"model": LLM_MODEL, "messages": [{"role": "user", "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array. Example: [\"Question 1?\"]"}]}
122
  try:
 
129
 
130
  yield format_sse({"event": "plan", "data": sub_questions})
131
 
 
132
  yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
133
  search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
134
  all_search_results = await asyncio.gather(*search_tasks)
 
135
  unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
136
 
137
  if not unique_sources:
138
  yield format_sse({"event": "error", "data": "All search queries returned zero usable sources."}); return
139
 
 
140
  sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
141
  yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})
142
 
 
152
  if not consolidated_context.strip():
153
  yield format_sse({"event": "error", "data": "Failed to gather any research context."}); return
154
 
 
155
  yield format_sse({"event": "status", "data": "Synthesizing final report..."})
156
+ report_prompt = f'Synthesize the provided context into a long-form, comprehensive, multi-page report on "{query}". Use markdown. Elaborate extensively on each point. Base your entire report ONLY on the provided context.\n\n## Research Context ##\n{consolidated_context}'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": report_prompt}], "stream": True}
158
 
159
  async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
 
163
  if line_str.startswith('data:'): line_str = line_str[5:].strip()
164
  if line_str == "[DONE]": break
165
 
166
+ # ***** CHANGE 3: The definitive fix for the 'list index out of range' error *****
167
  try:
168
  chunk = json.loads(line_str)
169
  choices = chunk.get("choices")
 
172
  if content:
173
  yield format_sse({"event": "chunk", "data": content})
174
  except json.JSONDecodeError:
175
+ continue
176
 
 
177
  yield format_sse({"event": "sources", "data": all_sources_used})
178
  except Exception as e:
179
  logger.error(f"A critical error occurred: {e}", exc_info=True)
 
181
  finally:
182
  yield format_sse({"event": "done", "data": "Deep research complete."})
183
 
 
184
@app.post("/v1/deepresearch/completions")
async def deep_research_endpoint(request: DeepResearchRequest):
    """Stream a deep-research report for the given query as Server-Sent Events."""
    event_stream = run_deep_research_stream(request.query)
    return StreamingResponse(event_stream, media_type="text/event-stream")