rkihacker committed
Commit 3c9a1a6 · verified · 1 Parent(s): ffce11c

Update main.py

Files changed (1):
  1. main.py +25 -40
main.py CHANGED
@@ -5,7 +5,6 @@ import logging
 import random
 import re
 from typing import AsyncGenerator, Optional, Tuple, List
-from urllib.parse import unquote
 
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
@@ -28,11 +27,12 @@ else:
     logger.info("LLM API Key loaded successfully.")
 
 # --- Constants & Headers ---
+SEARCH_API_URL = "https://search.privateinstance.com/api/text" # The new search provider
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "gpt-4.1-mini"
 MAX_SOURCES_TO_PROCESS = 15
 
-# Real Browser User Agents for Rotation
+# Real Browser User Agents for SCRAPING
 USER_AGENTS = [
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
@@ -46,19 +46,12 @@ class DeepResearchRequest(BaseModel):
 
 app = FastAPI(
     title="AI Deep Research API",
-    description="Provides robust, long-form, streaming deep research completions.",
-    version="7.0.0" # Final Production Version
+    description="Provides robust, long-form, streaming deep research completions using the PrivateInstance Search API.",
+    version="8.0.0" # Final Production Version with PrivateInstance API
 )
 
-# ***** CHANGE 1: Enable CORS for all origins *****
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"], # Allows all origins
-    allow_credentials=True,
-    allow_methods=["*"], # Allows all methods
-    allow_headers=["*"], # Allows all headers
-)
-logger.info("CORS middleware enabled for all origins.")
+# Enable CORS for all origins
+app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
 
 # --- Helper Functions ---
 def extract_json_from_llm_response(text: str) -> Optional[list]:
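The one-line add_middleware call keeps the same fully permissive policy as the removed block: all origins, methods, and headers, with credentials allowed. A preflight sanity check, sketched with Starlette's TestClient; the route path "/research" is hypothetical, since the endpoint itself is outside this diff:

from fastapi.testclient import TestClient
from main import app  # assumes main.py is importable in the test environment

client = TestClient(app)
resp = client.options(
    "/research",  # hypothetical path; substitute the real endpoint
    headers={
        "Origin": "https://example.com",
        "Access-Control-Request-Method": "POST",
    },
)
print(resp.status_code)
print(resp.headers.get("access-control-allow-origin"))

Note that allow_origins=["*"] together with allow_credentials=True is as permissive as CORS gets; reasonable for a public demo API, worth restricting otherwise.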
@@ -69,31 +62,26 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
     return None
 
 # --- Core Service Functions ---
-async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
-    search_url = f"https://html.duckduckgo.com/html/?q={query.replace(' ', '+')}"
-    headers = {'User-Agent': random.choice(USER_AGENTS)}
+async def call_privateinstance_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
+    """Performs a search using the PrivateInstance Search API."""
+    params = {'q': query, 'max_results': 10}
+    logger.info(f"Searching PrivateInstance API for: '{query}'")
     try:
-        async with session.get(search_url, headers=headers, timeout=15) as response:
-            response.raise_for_status(); html = await response.text()
-            soup = BeautifulSoup(html, "html.parser"); results = []
-            for res in soup.find_all('div', class_='result'):
-                title_tag = res.find('a', class_='result__a')
-                snippet_tag = res.find('a', class_='result__snippet')
-                if title_tag and snippet_tag and 'href' in title_tag.attrs:
-                    # ***** CHANGE 2: The critical fix for scraping. Decode the real URL. *****
-                    try:
-                        raw_link = title_tag['href']
-                        # The real URL is percent-encoded in the 'uddg' parameter
-                        actual_url = unquote(raw_link.split('uddg=')[1])
-                        if actual_url.startswith("http"):
-                            results.append({'title': title_tag.text, 'link': actual_url, 'snippet': snippet_tag.text})
-                    except IndexError:
-                        # This link format is unexpected, skip it
-                        continue
-            logger.info(f"Found {len(results)} valid sources from DuckDuckGo for: '{query}'")
+        async with session.get(SEARCH_API_URL, params=params, timeout=15) as response:
+            response.raise_for_status()
+            data = await response.json()
+            # The API might return results in a list directly or under a 'results' key.
+            raw_results = data if isinstance(data, list) else data.get('results', [])
+
+            # Map the API's response keys to our internal format
+            results = [
+                {'title': r.get('title'), 'link': r.get('url'), 'snippet': r.get('description')}
+                for r in raw_results if r.get('url') and r.get('title') and r.get('description')
+            ]
+            logger.info(f"Found {len(results)} sources from PrivateInstance for: '{query}'")
             return results
     except Exception as e:
-        logger.error(f"DuckDuckGo search failed for query '{query}': {e}"); return []
+        logger.error(f"PrivateInstance search failed for query '{query}': {e}"); return []
 
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
     headers = {'User-Agent': random.choice(USER_AGENTS)}
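The in-code comments above make two assumptions about the PrivateInstance payload: it may be a bare list or a {'results': [...]} envelope, and each entry carries title, url, and description keys. Pulled out as a pure function, the normalization step can be checked against both shapes with toy data (the sample payloads are hypothetical):

from typing import List

def normalize_results(data) -> List[dict]:
    # Same logic as call_privateinstance_search: accept either shape,
    # then keep only entries that have all three expected keys.
    raw_results = data if isinstance(data, list) else data.get('results', [])
    return [
        {'title': r.get('title'), 'link': r.get('url'), 'snippet': r.get('description')}
        for r in raw_results
        if r.get('url') and r.get('title') and r.get('description')
    ]

bare_list = [{'title': 'A', 'url': 'https://a.example', 'description': 'first'}]
enveloped = {'results': [
    {'title': 'B', 'url': 'https://b.example', 'description': 'second'},
    {'title': 'C', 'url': 'https://c.example'},  # no description: filtered out
]}

print(normalize_results(bare_list))   # [{'title': 'A', 'link': 'https://a.example', 'snippet': 'first'}]
print(normalize_results(enveloped))   # only the complete entry survives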
@@ -130,7 +118,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             yield format_sse({"event": "plan", "data": sub_questions})
 
             yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
-            search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
+            search_tasks = [call_privateinstance_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
             unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
 
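The unique_sources line folds every per-question result list into one dict keyed by URL, so each link survives exactly once and a later duplicate overwrites an earlier one. A tiny sketch with made-up data:

all_search_results = [
    [{'link': 'https://a.example', 'title': 'A (query 1)', 'snippet': 'text'}],
    [{'link': 'https://a.example', 'title': 'A (query 2)', 'snippet': 'text'},
     {'link': 'https://b.example', 'title': 'B', 'snippet': 'text'}],
]
unique_sources = list({source['link']: source
                       for results in all_search_results
                       for source in results}.values())
print([s['title'] for s in unique_sources])  # ['A (query 2)', 'B']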
@@ -162,8 +150,6 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                     line_str = line.decode('utf-8').strip()
                     if line_str.startswith('data:'): line_str = line_str[5:].strip()
                     if line_str == "[DONE]": break
-
-                    # ***** CHANGE 3: The definitive fix for the 'list index out of range' error *****
                     try:
                         chunk = json.loads(line_str)
                         choices = chunk.get("choices")
@@ -171,8 +157,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                             content = choices[0].get("delta", {}).get("content")
                             if content:
                                 yield format_sse({"event": "chunk", "data": content})
-                    except json.JSONDecodeError:
-                        continue
+                    except json.JSONDecodeError: continue
 
             yield format_sse({"event": "sources", "data": all_sources_used})
         except Exception as e:
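The streaming loop above strips the data: prefix, stops on the [DONE] sentinel, guards against empty choices (the old 'list index out of range' failure), and skips chunks that are not valid JSON. The same handling as a standalone sketch over fake stream lines; format_sse here is a hypothetical stand-in for the helper main.py defines outside this diff:

import json

def format_sse(payload: dict) -> str:
    # Hypothetical stand-in: main.py's real helper is not shown in this diff.
    return f"data: {json.dumps(payload)}\n\n"

fake_stream = [
    b'data: {"choices": [{"delta": {"content": "Hel"}}]}',
    b'data: {"choices": [{"delta": {"content": "lo"}}]}',
    b'data: not-valid-json',          # malformed chunk: skipped, not fatal
    b'data: {"choices": []}',         # empty choices: guarded, never indexed
    b'data: [DONE]',                  # sentinel: stop reading
]

for line in fake_stream:
    line_str = line.decode('utf-8').strip()
    if line_str.startswith('data:'): line_str = line_str[5:].strip()
    if line_str == "[DONE]": break
    try:
        chunk = json.loads(line_str)
        choices = chunk.get("choices")
        if choices:
            content = choices[0].get("delta", {}).get("content")
            if content:
                print(format_sse({"event": "chunk", "data": content}), end="")
    except json.JSONDecodeError:
        continue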
 
  except Exception as e: