rkihacker committed
Commit b7afcad · verified · 1 Parent(s): 427157a

Update main.py

Files changed (1)
  1. main.py +30 -48
main.py CHANGED
@@ -33,11 +33,10 @@ else:
 # --- Constants & Headers ---
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-MAX_SOURCES_TO_PROCESS = 6 # Reduced to stay within time limits with real requests
-MAX_CONCURRENT_REQUESTS = 3 # Be conservative with real websites
+MAX_SOURCES_TO_PROCESS = 6
+MAX_CONCURRENT_REQUESTS = 3
 RESEARCH_TIMEOUT = 180 # 3 minutes maximum
-REQUEST_DELAY = 2.0 # Longer delay between requests to be more polite
-USER_AGENT_ROTATION = True
+REQUEST_DELAY = 2.0
 
 # Initialize fake user agent generator
 try:
@@ -65,7 +64,7 @@ class DeepResearchRequest(BaseModel):
 app = FastAPI(
     title="AI Deep Research API",
     description="Provides robust, long-form, streaming deep research completions using real web searches.",
-    version="2.1.0" # Updated version
+    version="2.1.0"
 )
 app.add_middleware(
     CORSMiddleware,
@@ -87,18 +86,21 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
 
 async def get_real_user_agent() -> str:
     """Get a realistic user agent string."""
-    if USER_AGENT_ROTATION:
-        return ua.random()
-    return "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
+    try:
+        if isinstance(ua, UserAgent):
+            return ua.random
+        return ua.random() # For our fallback class
+    except:
+        return "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
 
 async def check_robots_txt(url: str) -> bool:
     """Check if scraping is allowed by robots.txt."""
     try:
-        domain = re.search(r'https?://([^/]+)', url)
-        if not domain:
+        domain_match = re.search(r'https?://([^/]+)', url)
+        if not domain_match:
             return False
 
-        domain = domain.group(1)
+        domain = domain_match.group(1)
         robots_url = f"https://{domain}/robots.txt"
 
         async with aiohttp.ClientSession() as session:
@@ -106,22 +108,20 @@ async def check_robots_txt(url: str) -> bool:
             async with session.get(robots_url, headers=headers, timeout=5) as response:
                 if response.status == 200:
                     robots = await response.text()
-                    # Simple check - disallow all if present
                     if "Disallow: /" in robots:
                         return False
-                    # Check for specific disallow rules for our path
+                    # Check for specific path disallows
                     path = re.sub(r'https?://[^/]+', '', url)
-                    if f"Disallow: {path}" in robots:
+                    if any(f"Disallow: {p}" in robots for p in [path, path.rstrip('/') + '/']):
                         return False
         return True
     except Exception as e:
         logging.warning(f"Could not check robots.txt for {url}: {e}")
-        return False # Default to not scraping if we can't check
+        return False
 
 async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
     """
     Perform a real search using DuckDuckGo's HTML interface.
-    Note: This may break if DuckDuckGo changes their HTML structure.
     """
     try:
         search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
@@ -144,20 +144,18 @@ async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
 
                 results = []
                 # Updated selectors for DuckDuckGo's current HTML structure
-                for result in soup.select('.result')[:max_results]:
+                for result in soup.select('.result__body')[:max_results]:
                     try:
                         title_elem = result.select_one('.result__title .result__a')
                         link_elem = title_elem if title_elem else result.select_one('a')
                         snippet_elem = result.select_one('.result__snippet')
 
                         if title_elem and link_elem and snippet_elem:
-                            # Clean up the URL
+                            # Handle DuckDuckGo's redirect URLs
                             link = link_elem['href']
                             if link.startswith('/l/'):
-                                # DuckDuckGo returns relative links that redirect
-                                # We need to follow these to get the actual URL
+                                redirect_url = f"https://duckduckgo.com{link}"
                                 try:
-                                    redirect_url = f"https://duckduckgo.com{link}"
                                     async with session.get(redirect_url, headers=headers, timeout=5, allow_redirects=False) as redirect_resp:
                                         if redirect_resp.status == 302:
                                             link = redirect_resp.headers.get('Location', link)
@@ -176,7 +174,6 @@ async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
 
         logging.info(f"Found {len(results)} real search results for '{query}'")
         return results
-
     except Exception as e:
         logging.error(f"Real search failed: {e}")
         return []
@@ -223,7 +220,6 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeo
             tag.decompose()
 
         # Try to find main content by common patterns
-        main_content = None
         selectors_to_try = [
             'main',
             'article',
@@ -236,6 +232,7 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeo
             '#content'
         ]
 
+        main_content = None
         for selector in selectors_to_try:
             main_content = soup.select_one(selector)
             if main_content:
@@ -244,10 +241,8 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeo
         if not main_content:
             # If no main content found, try to find the largest text block
             all_elements = soup.find_all()
-            # Filter out elements that are likely not main content
             candidates = [el for el in all_elements if el.name not in ['script', 'style', 'nav', 'footer', 'header']]
             if candidates:
-                # Sort by text length
                 candidates.sort(key=lambda x: len(x.get_text()), reverse=True)
                 main_content = candidates[0] if candidates else soup
 
@@ -270,7 +265,7 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeo
             content = " ".join(soup.stripped_strings)
             content = re.sub(r'\s+', ' ', content).strip()
 
-        if len(content.split()) < 30: # Minimum threshold for useful content
+        if len(content.split()) < 30:
             logging.warning(f"Very little content extracted from {source['link']}")
             return source.get('snippet', ''), source_info
 
@@ -312,25 +307,22 @@ async def generate_research_plan(query: str, session: aiohttp.ClientSession) ->
                 content = result['choices'][0]['message']['content']
                 sub_questions = extract_json_from_llm_response(content)
                 if sub_questions and isinstance(sub_questions, list):
-                    # Clean up the questions
                     cleaned = []
                     for q in sub_questions:
                         if isinstance(q, str) and q.strip():
                             cleaned_q = re.sub(r'^[^a-zA-Z0-9]*|[^a-zA-Z0-9]*$', '', q)
                             if cleaned_q:
                                 cleaned.append(cleaned_q)
-                    return cleaned[:5] # Limit to 5 questions max
+                    return cleaned[:5]
 
                 # Fallback if we couldn't get good questions from LLM
-                default_questions = [
+                return [
                     f"What is {query} and its key characteristics?",
                     f"What are the main aspects or components of {query}?",
                     f"What is the history and development of {query}?",
                     f"What are the current trends or recent developments in {query}?",
                     f"What are common challenges or controversies related to {query}?"
                 ]
-                return default_questions[:4]
-
     except Exception as e:
         logging.error(f"Failed to generate research plan: {e}")
         return [
@@ -416,7 +408,6 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             "data": f"Found {len(unique_sources)} unique sources to process."
         })
 
-        # If we have no sources, return early
         if not unique_sources:
             yield format_sse({
                 "event": "error",
@@ -439,7 +430,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
         for i, source in enumerate(unique_sources):
             # Check if we're running out of time
             elapsed = time.time() - start_time
-            if elapsed > RESEARCH_TIMEOUT * 0.7: # Leave 30% of time for synthesis
+            if elapsed > RESEARCH_TIMEOUT * 0.7:
                 yield format_sse({
                     "event": "status",
                     "data": f"Approaching time limit, stopping source processing at {i}/{len(unique_sources)}"
@@ -448,12 +439,11 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
 
             # Add delay between processing each source to be polite
             if i > 0:
-                await asyncio.sleep(REQUEST_DELAY * 0.5) # Shorter delay between same-domain requests
+                await asyncio.sleep(REQUEST_DELAY * 0.5)
 
             task = asyncio.create_task(process_with_semaphore(source))
             processing_tasks.append(task)
 
-            # Yield progress updates periodically
             if (i + 1) % 2 == 0 or (i + 1) == len(unique_sources):
                 yield format_sse({
                     "event": "status",
@@ -465,11 +455,10 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
 
             processed_sources += 1
             content, source_info = await future
             if content and content.strip():
-                # Add source content to our consolidated context
                 consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
                 all_sources_used.append(source_info)
                 successful_sources += 1
-                total_tokens += len(content.split()) # Rough token count
+                total_tokens += len(content.split())
             else:
                 processing_errors += 1
@@ -480,14 +469,13 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             })
             return
 
-        # Step 4: Synthesize report with improved prompt
+        # Step 4: Synthesize report
         time_remaining = max(0, RESEARCH_TIMEOUT - (time.time() - start_time))
         yield format_sse({
             "event": "status",
             "data": f"Synthesizing report with content from {successful_sources} sources..."
        })
 
-        # Estimate how many tokens we can generate based on remaining time
         max_output_tokens = min(1500, int(time_remaining * 5))
 
         report_prompt = f"""Compose a comprehensive research report on "{query}".
@@ -496,16 +484,15 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
 
 Key requirements:
 1. Start with an introduction that explains what {query} is and why it's important
-2. Include well-organized sections with clear headings based on the research questions
+2. Include well-organized sections with clear headings
 3. Cite specific information from sources where appropriate
 4. End with a conclusion that summarizes key findings and insights
 5. Keep the report concise but comprehensive
 
 Available information (summarized from {successful_sources} sources):
-{consolidated_context[:18000]} # Increased context size but still limited
+{consolidated_context[:18000]}
 
-Generate a report that is approximately {max_output_tokens//4} words long (about {max_output_tokens//4//200} paragraphs).
-Focus on the most important and relevant information.
+Generate a report that is approximately {max_output_tokens//4} words long.
 """
 
         report_payload = {
@@ -515,11 +502,9 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             "max_tokens": max_output_tokens
         }
 
-        # Stream the report generation
        async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
            response.raise_for_status()
            async for line in response.content:
-                # Check if we're running out of time
                if time.time() - start_time > RESEARCH_TIMEOUT:
                    yield format_sse({
                        "event": "warning",
539
  content = choices[0].get("delta", {}).get("content")
540
  if content:
541
  yield format_sse({"event": "chunk", "data": content})
542
- except json.JSONDecodeError:
543
- continue
544
  except Exception as e:
545
  logging.warning(f"Error processing stream chunk: {e}")
546
  continue
547
 
548
- # Final status update
549
  duration = time.time() - start_time
550
  stats = {
551
  "total_time_seconds": round(duration),
 