rkihacker committed
Commit 4906187 · verified · 1 Parent(s): bc2abd9

Update main.py

Files changed (1)
  1. main.py +63 -54
main.py CHANGED
@@ -5,6 +5,7 @@ import logging
import random
import re
from typing import AsyncGenerator, Optional, Tuple, List
+ from urllib.parse import quote_plus

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
@@ -26,34 +27,15 @@ else:
    logger.info("LLM API Key loaded successfully.")

# --- Constants & Headers ---
- # API Provider Constants
- SNAPZION_API_URL = "https://search.snapzion.com/get-snippets"
LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
LLM_MODEL = "gpt-4.1-mini"

- # Automatic Context Sizing (No more fixed limits)
+ # Automatic Context Sizing
TARGET_TOKEN_LIMIT = 28000
ESTIMATED_CHARS_PER_TOKEN = 4
MAX_CONTEXT_CHAR_LENGTH = TARGET_TOKEN_LIMIT * ESTIMATED_CHARS_PER_TOKEN

- # ***** THE CRITICAL FIX: Full, legitimate headers for the Snapzion API call *****
- SNAPZION_HEADERS = {
-     'accept': '*/*',
-     'accept-language': 'en-US,en;q=0.9',
-     'content-type': 'application/json',
-     'origin': 'https://search.snapzion.com',
-     'priority': 'u=1, i',
-     'referer': 'https://search.snapzion.com/docs',
-     'sec-ch-ua': '"Chromium";v="140", "Not=A?Brand";v="24", "Google Chrome";v="140"',
-     'sec-ch-ua-mobile': '?0',
-     'sec-ch-ua-platform': '"Windows"',
-     'sec-fetch-dest': 'empty',
-     'sec-fetch-mode': 'cors',
-     'sec-fetch-site': 'same-origin',
-     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
- }
-
- # Real Browser User Agents for SCRAPING ROTATION
+ # Real Browser User Agents for Rotation (Used for both search and scraping)
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
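
The context-sizing constants retained above convert a target token budget into a character cap: 28,000 tokens at roughly 4 characters per token gives MAX_CONTEXT_CHAR_LENGTH = 112,000 characters. A minimal sketch of how such a cap could be applied when concatenating scraped sources follows; the truncate_context helper is illustrative only and is not part of this commit.

# Illustrative helper (not in main.py): shows the character-budget arithmetic behind MAX_CONTEXT_CHAR_LENGTH.
TARGET_TOKEN_LIMIT = 28000
ESTIMATED_CHARS_PER_TOKEN = 4
MAX_CONTEXT_CHAR_LENGTH = TARGET_TOKEN_LIMIT * ESTIMATED_CHARS_PER_TOKEN  # 112,000 characters

def truncate_context(chunks: list, limit: int = MAX_CONTEXT_CHAR_LENGTH) -> str:
    """Concatenate source texts until the estimated character budget is exhausted (hypothetical helper)."""
    parts, used = [], 0
    for chunk in chunks:
        remaining = limit - used
        if remaining <= 0:
            break
        parts.append(chunk[:remaining])
        used += min(len(chunk), remaining)
    return "\n\n".join(parts)
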
@@ -74,46 +56,74 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:

app = FastAPI(
    title="AI Deep Research API",
-     description="Provides robust, streaming deep research completions.",
-     version="4.0.0" # Final Production Version
+     description="Provides robust, streaming deep research completions using DuckDuckGo Search.",
+     version="5.0.0" # Final Production Version with new Search Provider
)

# --- Core Service Functions ---
- async def call_snapzion_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
-     logger.info(f"Searching Snapzion for: '{query}'")
+
+ # ***** THE NEW SEARCH FUNCTION *****
+ async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
+     """Performs a search using DuckDuckGo's HTML interface and parses the results."""
+     search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
+     logger.info(f"Searching DuckDuckGo for: '{query}'")
+     headers = {'User-Agent': random.choice(USER_AGENTS)}
+
    try:
-         async with session.post(SNAPZION_API_URL, headers=SNAPZION_HEADERS, json={"query": query}, timeout=20) as response:
+         async with session.get(search_url, headers=headers, timeout=15) as response:
            response.raise_for_status()
-             data = await response.json()
-             results = data.get("organic_results", [])
-             logger.info(f"Found {len(results)} sources for: '{query}'")
+             html = await response.text()
+             soup = BeautifulSoup(html, "html.parser")
+
+             results = []
+             # Find all result containers, which have a class 'result'
+             for result_container in soup.find_all('div', class_='result'):
+                 title_tag = result_container.find('a', class_='result__a')
+                 snippet_tag = result_container.find('a', class_='result__snippet')
+
+                 if title_tag and snippet_tag and title_tag.has_attr('href'):
+                     # The link in DDG's HTML version is a redirect, so we need to clean it
+                     raw_link = title_tag['href']
+                     cleaned_link = re.sub(r'/l/\?kh=-1&uddg=', '', raw_link)
+
+                     results.append({
+                         'title': title_tag.get_text(strip=True),
+                         'link': cleaned_link,
+                         'snippet': snippet_tag.get_text(strip=True)
+                     })
+
+         logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
        return results
    except Exception as e:
-         logger.error(f"Snapzion search failed for query '{query}': {e}"); return []
+         logger.error(f"DuckDuckGo search failed for query '{query}': {e}"); return []
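
The cleaned_link step above strips DuckDuckGo's redirect prefix with a fixed regular expression. As a hedged alternative sketch (not part of this commit), the destination URL can also be recovered by decoding the uddg query parameter explicitly; this assumes DuckDuckGo's current /l/?uddg=... redirect format, which may change.

# Illustrative alternative (not in main.py) for resolving DuckDuckGo redirect links.
from urllib.parse import urlparse, parse_qs, unquote

def resolve_ddg_redirect(raw_link: str) -> str:
    """Return the decoded destination from a DuckDuckGo /l/ redirect link, or the link unchanged."""
    parsed = urlparse(raw_link)
    if parsed.path.endswith("/l/"):
        target = parse_qs(parsed.query).get("uddg")
        if target:
            return unquote(target[0])
    return raw_link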
 
- async def scrape_url(session: aiohttp.ClientSession, url: str) -> str:
-     if url.lower().endswith('.pdf'): return "Error: PDF"
-     try:
-         headers = {'User-Agent': random.choice(USER_AGENTS)}
-         async with session.get(url, headers=headers, timeout=10, ssl=False) as response:
-             if response.status != 200: return f"Error: HTTP {response.status}"
-             return await response.text() # Return full HTML for parsing
-     except Exception as e:
-         return f"Error: {e}"
-
- def parse_html(html: str) -> str:
-     soup = BeautifulSoup(html, "html.parser")
-     for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']): tag.decompose()
-     return " ".join(soup.stripped_strings)

async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
-     html_or_error = await scrape_url(session, source['link'])
-     if html_or_error.startswith("Error:"):
-         logger.warning(f"Scraping failed for {source['link']} ({html_or_error}). Falling back to snippet.")
-         return source.get('snippet', ''), source
+     """Scrapes a single source and falls back to its snippet if scraping fails."""
+     logger.info(f"Processing source: {source['link']}")
+     headers = {'User-Agent': random.choice(USER_AGENTS)}

-     content = parse_html(html_or_error)
-     return content, source
+     try:
+         if source['link'].lower().endswith('.pdf'):
+             raise ValueError("PDF content cannot be scraped.")
+
+         async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
+             if response.status != 200:
+                 raise ValueError(f"HTTP status {response.status}")
+
+             html = await response.text()
+             soup = BeautifulSoup(html, "html.parser")
+             for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']): tag.decompose()
+             content = " ".join(soup.stripped_strings)
+
+             if not content.strip(): # Check if parsed content is empty
+                 raise ValueError("Parsed content is empty.")
+
+             return content, source
+
+     except Exception as e:
+         logger.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
+         return source.get('snippet', ''), source

# --- Streaming Deep Research Logic ---
async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
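
The inlined scraping path above performs the same BeautifulSoup cleanup that the removed parse_html helper did: structural tags are decomposed and the remaining strings are joined. A small self-contained illustration of that step, using made-up sample HTML:

# Illustrative snippet (not in main.py) showing the tag-stripping step on sample HTML.
from bs4 import BeautifulSoup

sample_html = "<html><body><nav>menu</nav><p>Main text.</p><script>track()</script></body></html>"
soup = BeautifulSoup(sample_html, "html.parser")
for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
    tag.decompose()
print(" ".join(soup.stripped_strings))  # prints: Main text.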
@@ -133,20 +143,19 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:

        yield format_sse({"event": "plan", "data": sub_questions})

-         # Step 2: Conduct Research in Parallel
+         # Step 2: Conduct Research in Parallel using DuckDuckGo
        yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
-         search_tasks = [call_snapzion_search(session, sq) for sq in sub_questions]
+         search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
        all_search_results = await asyncio.gather(*search_tasks)

        unique_sources = list({source['link']: source for results in all_search_results for source in results if 'link' in source and 'snippet' in source}.values())

        if not unique_sources:
-             yield format_sse({"event": "error", "data": "All search queries returned zero usable sources. The search provider might be blocking requests or the topic is too obscure."}); return
+             yield format_sse({"event": "error", "data": "All search queries returned zero usable sources from DuckDuckGo."}); return

        yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing..."})

        processing_tasks = [research_and_process_source(session, source) for source in unique_sources]
-
        consolidated_context, all_sources_used = "", []
        successful_scrapes = 0
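
Each event in the hunk above is serialized by format_sse before being yielded. That helper is defined elsewhere in main.py and is not shown in this diff; as a hedged sketch, a typical Server-Sent Events framing for a JSON payload looks like this:

# Sketch only: the actual format_sse in main.py may differ.
import json

def format_sse(payload: dict) -> str:
    """Frame a JSON payload as a single SSE message."""
    return f"data: {json.dumps(payload)}\n\n"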
 
 
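
For completeness, a hypothetical client for the streaming endpoint. The route path, query parameter name, and the "data: <json>" framing are assumptions here; none of them appear in this diff.

# Hypothetical consumer; endpoint path, parameter name, and SSE framing are assumed, not taken from main.py.
import asyncio
import json
import aiohttp

async def consume(query: str) -> None:
    async with aiohttp.ClientSession() as session:
        async with session.get("http://localhost:8000/research", params={"query": query}) as resp:
            async for raw_line in resp.content:  # aiohttp's StreamReader yields the response line by line
                line = raw_line.decode("utf-8").strip()
                if line.startswith("data: "):
                    event = json.loads(line[len("data: "):])
                    print(event.get("event"), "->", event.get("data"))

# asyncio.run(consume("impact of solid-state batteries on EV range"))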