rkihacker committed (verified)
Commit 0359a50 · 1 Parent(s): 6ac9507

Update main.py

Files changed (1)
  1. main.py +69 -26
main.py CHANGED
@@ -16,7 +16,7 @@ from bs4 import BeautifulSoup
 
 # --- Configuration ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = aiohttp.log.access_logger # Use aiohttp's logger for better async context
+logger = logging.getLogger(__name__)
 
 load_dotenv()
 LLM_API_KEY = os.getenv("LLM_API_KEY")
@@ -30,6 +30,7 @@ else:
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
 MAX_SOURCES_TO_PROCESS = 15
+SEARCH_PAGES_TO_FETCH = 2 # Fetch first 2 pages of results for each query
 
 # Real Browser User Agents for SCRAPING
 USER_AGENTS = [
@@ -45,8 +46,8 @@ class DeepResearchRequest(BaseModel):
 
 app = FastAPI(
     title="AI Deep Research API",
-    description="Provides robust, long-form, streaming deep research completions using a simulated search.",
-    version="10.0.0" # Final: Using simulated search to bypass external blocking.
+    description="Provides robust, long-form, streaming deep research completions using a live, multi-page DuckDuckGo search.",
+    version="11.0.0" # Implemented robust, multi-page live web search
 )
 
 app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
@@ -58,33 +59,75 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
     except json.JSONDecodeError: return None
     return None
 
-async def call_duckduckgo_search(query: str, max_results: int = 10) -> List[dict]:
-    """
-    Simulates a successful DuckDuckGo search to bypass anti-scraping measures.
-    This function returns a static, hardcoded list of relevant search results
-    for the topic "Nian" (Chinese New Year beast), allowing the rest of the
-    application pipeline to be tested.
-    """
-    logging.info(f"Simulating search for: '{query}'")
-
-    # Static results related to "Nian" myth, as "niansuh" yields no results.
-    # This provides the scraper with valid URLs to process.
-    simulated_results = [
-        {'title': 'Nian - Wikipedia', 'link': 'https://en.wikipedia.org/wiki/Nian', 'snippet': 'The Nian is a beast from Chinese mythology. The Nian is said to have the body of a bull, the head of a lion with a single horn, and sharp teeth.'},
-        {'title': 'The Legend of Nian and the Origins of Chinese New Year', 'link': 'https://www.chinahighlights.com/travelguide/festivals/story-of-nian.htm', 'snippet': 'Learn about the monster Nian and how the traditions of wearing red, setting off firecrackers, and staying up late came to be part of Chinese New Year.'},
-        {'title': 'Nian: The Beast That Invented Chinese New Year - Culture Trip', 'link': 'https://theculturetrip.com/asia/china/articles/nian-the-beast-that-invented-chinese-new-year', 'snippet': 'Once a year, at the beginning of Chinese New Year, a beast named Nian would terrorize a small village in China, eating their crops, livestock, and children.'},
-        {'title': 'Chinese New Year mythology: The story of Nian - British Museum', 'link': 'https://www.britishmuseum.org/blog/chinese-new-year-mythology-story-nian', 'snippet': 'Discover the mythical origins of the Chinese New Year celebration and the fearsome beast, Nian.'},
-        {'title': 'Year of the Nian Monster - Asian Art Museum', 'link': 'https://education.asianart.org/resources/year-of-the-nian-monster/', 'snippet': 'A summary of the story of the Nian monster for educators and children, explaining the connection to modern traditions.'}
-    ]
-
-    logging.info(f"Returning {len(simulated_results)} static sources.")
-    return simulated_results[:max_results]
+def parse_search_results(soup: BeautifulSoup) -> List[dict]:
+    """Helper to parse results from a BeautifulSoup object."""
+    results = []
+    for result_div in soup.find_all('div', class_='result'):
+        title_elem = result_div.find('a', class_='result__a')
+        snippet_elem = result_div.find('a', class_='result__snippet')
+        if title_elem and snippet_elem:
+            link = title_elem.get('href')
+            title = title_elem.get_text(strip=True)
+            snippet = snippet_elem.get_text(strip=True)
+            if link and title and snippet:
+                results.append({'title': title, 'link': link, 'snippet': snippet})
+    return results
+
+async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str, max_results: int = 15) -> List[dict]:
+    """
+    Performs a robust, multi-page search on DuckDuckGo's HTML interface.
+    """
+    logger.info(f"Starting multi-page search for: '{query}'")
+    search_url = "https://html.duckduckgo.com/html/"
+
+    headers = {
+        'Content-Type': 'application/x-www-form-urlencoded',
+        'User-Agent': random.choice(USER_AGENTS),
+        'Referer': 'https://html.duckduckgo.com/'
+    }
+
+    all_results = []
+    payload = {'q': query}
+
+    try:
+        for page in range(SEARCH_PAGES_TO_FETCH):
+            logger.info(f"Searching page {page + 1} for '{query}'...")
+            async with session.post(search_url, data=payload, headers=headers, timeout=15) as response:
+                if response.status != 200:
+                    logger.warning(f"Search for '{query}' page {page+1} returned status {response.status}. Stopping search for this query.")
+                    break
+
+                html = await response.text()
+                soup = BeautifulSoup(html, "html.parser")
+
+                page_results = parse_search_results(soup)
+                all_results.extend(page_results)
+
+                # Find the 'Next' form to get parameters for the next page request
+                next_form = soup.find('form', action='/html/', method='post', string=lambda t: t and 'Next' in t)
+                if not next_form:
+                    logger.info(f"No 'Next' page found for '{query}'. Ending search.")
+                    break
+
+                # Update payload with hidden inputs for the next page
+                payload = {inp.get('name'): inp.get('value') for inp in next_form.find_all('input')}
+                if not payload:
+                    logger.info(f"Could not find parameters for next page. Ending search.")
+                    break
+
+                await asyncio.sleep(random.uniform(0.5, 1.5)) # Small delay to mimic human behavior
+
+    except Exception as e:
+        logger.error(f"An error occurred during multi-page search for '{query}': {e}", exc_info=True)
+
+    logger.info(f"Found a total of {len(all_results)} sources from {SEARCH_PAGES_TO_FETCH} pages for: '{query}'")
+    return all_results[:max_results]
 
 
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
-        logging.info(f"Scraping: {source['link']}")
+        logger.info(f"Scraping: {source['link']}")
         if source['link'].lower().endswith('.pdf'): raise ValueError("PDF content")
         async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
             if response.status != 200: raise ValueError(f"HTTP status {response.status}")
@@ -95,7 +138,7 @@ async def research_and_process_source(session: aiohttp.ClientSession, source: di
         if not content.strip(): raise ValueError("Parsed content is empty.")
         return content, source
     except Exception as e:
-        logging.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
+        logger.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
         return source.get('snippet', ''), source
 
 async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
@@ -114,13 +157,13 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
 
     yield format_sse({"event": "plan", "data": sub_questions})
 
-    yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
-    search_tasks = [call_duckduckgo_search(sq) for sq in sub_questions]
+    yield format_sse({"event": "status", "data": f"Performing deep search for {len(sub_questions)} topics..."})
+    search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
     all_search_results = await asyncio.gather(*search_tasks)
     unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
 
     if not unique_sources:
-        yield format_sse({"event": "error", "data": "The simulated search returned no sources. Check the hardcoded list."}); return
+        yield format_sse({"event": "error", "data": f"The live multi-page search could not find any relevant sources for '{query}'. The topic might be too obscure."}); return
 
     sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
     yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})
@@ -135,7 +178,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
         all_sources_used.append(source_info)
 
     if not consolidated_context.strip():
-        yield format_sse({"event": "error", "data": "Failed to scrape content from any of the discovered sources."}); return
+        yield format_sse({"event": "error", "data": "Found sources, but failed to scrape meaningful content from any of them."}); return
 
     yield format_sse({"event": "status", "data": "Synthesizing final report..."})
     report_prompt = f'Synthesize the provided context into a long-form, comprehensive, multi-page report on "{query}". Use markdown. Elaborate extensively on each point. Base your entire report ONLY on the provided context.\n\n## Research Context ##\n{consolidated_context}'
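The commit's main change is the new `call_duckduckgo_search(session, query, max_results)` signature, which now takes a shared `aiohttp.ClientSession` and pages through DuckDuckGo's HTML endpoint. Below is a minimal, hypothetical driver for exercising just that search path outside the FastAPI app; it assumes the file is importable as `main` (so module-level setup such as `load_dotenv()` and the `LLM_API_KEY` check must succeed) and uses an illustrative query and `max_results` value not taken from the commit.

```python
# Hypothetical standalone driver for the new multi-page search (not part of the commit).
# Assumes main.py imports cleanly, e.g. LLM_API_KEY is set in the environment,
# since load_dotenv() and the key check run at import time.
import asyncio

import aiohttp

from main import call_duckduckgo_search  # new signature: (session, query, max_results=15)


async def demo() -> None:
    async with aiohttp.ClientSession() as session:
        # Example query and result cap chosen only for illustration.
        results = await call_duckduckgo_search(session, "history of Chinese New Year", max_results=5)
        for r in results:
            print(f"{r['title']} -> {r['link']}")


if __name__ == "__main__":
    asyncio.run(demo())
```

Sharing one `ClientSession` across all sub-question searches (as `run_deep_research_stream` now does by passing `session` into each task) keeps connection pooling and cookies consistent across the paginated POST requests.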