Spaces:

rkihacker
/

Scrap

Paused

App Files Community

rkihacker committed on Sep 15

Commit

64f616b

verified ·

1 Parent(s): 4906187

Update main.py

Browse files

Files changed (1) hide show

main.py +49 -68

main.py CHANGED Viewed

@@ -29,13 +29,9 @@ else:
 # --- Constants & Headers ---
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "gpt-4.1-mini"
-# Automatic Context Sizing
-TARGET_TOKEN_LIMIT = 28000
-ESTIMATED_CHARS_PER_TOKEN = 4
-MAX_CONTEXT_CHAR_LENGTH = TARGET_TOKEN_LIMIT * ESTIMATED_CHARS_PER_TOKEN
-# Real Browser User Agents for Rotation (Used for both search and scraping)
 USER_AGENTS = [
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
@@ -56,71 +52,40 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
 app = FastAPI(
     title="AI Deep Research API",
-    description="Provides robust, streaming deep research completions using DuckDuckGo Search.",
-    version="5.0.0"  # Final Production Version with new Search Provider
 )
 # --- Core Service Functions ---
-# ***** THE NEW SEARCH FUNCTION *****
 async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
-    """Performs a search using DuckDuckGo's HTML interface and parses the results."""
     search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
-    logger.info(f"Searching DuckDuckGo for: '{query}'")
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
         async with session.get(search_url, headers=headers, timeout=15) as response:
-            response.raise_for_status()
-            html = await response.text()
-            soup = BeautifulSoup(html, "html.parser")
-            results = []
-            # Find all result containers, which have a class 'result'
-            for result_container in soup.find_all('div', class_='result'):
-                title_tag = result_container.find('a', class_='result__a')
-                snippet_tag = result_container.find('a', class_='result__snippet')
-                if title_tag and snippet_tag and title_tag.has_attr('href'):
-                    # The link in DDG's HTML version is a redirect, so we need to clean it
-                    raw_link = title_tag['href']
-                    cleaned_link = re.sub(r'/l/\?kh=-1&uddg=', '', raw_link)
-                    results.append({
-                        'title': title_tag.get_text(strip=True),
-                        'link': cleaned_link,
-                        'snippet': snippet_tag.get_text(strip=True)
-                    })
             logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
             return results
     except Exception as e:
         logger.error(f"DuckDuckGo search failed for query '{query}': {e}"); return []
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
-    """Scrapes a single source and falls back to its snippet if scraping fails."""
-    logger.info(f"Processing source: {source['link']}")
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
-        if source['link'].lower().endswith('.pdf'):
-            raise ValueError("PDF content cannot be scraped.")
         async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
-            if response.status != 200:
-                raise ValueError(f"HTTP status {response.status}")
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
             for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']): tag.decompose()
             content = " ".join(soup.stripped_strings)
-            if not content.strip(): # Check if parsed content is empty
-                raise ValueError("Parsed content is empty.")
             return content, source
     except Exception as e:
         logger.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
         return source.get('snippet', ''), source
@@ -143,40 +108,49 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             yield format_sse({"event": "plan", "data": sub_questions})
-            # Step 2: Conduct Research in Parallel using DuckDuckGo
             yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
             search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
-            unique_sources = list({source['link']: source for results in all_search_results for source in results if 'link' in source and 'snippet' in source}.values())
             if not unique_sources:
-                yield format_sse({"event": "error", "data": "All search queries returned zero usable sources from DuckDuckGo."}); return
-            yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing..."})
-            processing_tasks = [research_and_process_source(session, source) for source in unique_sources]
             consolidated_context, all_sources_used = "", []
-            successful_scrapes = 0
             for task in asyncio.as_completed(processing_tasks):
                 content, source_info = await task
                 if content:
                     consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
                     all_sources_used.append(source_info)
-                    if not content == source_info.get('snippet'): successful_scrapes += 1
-            logger.info(f"Context complete. Scraped {successful_scrapes}/{len(unique_sources)} pages. Used {len(all_sources_used)} total sources (with snippet fallbacks).")
             if not consolidated_context.strip():
-                yield format_sse({"event": "error", "data": "Failed to gather any research context from scraping or snippets."}); return
-            # Step 3: Synthesize Final Report
             yield format_sse({"event": "status", "data": "Synthesizing final report..."})
-            if len(consolidated_context) > MAX_CONTEXT_CHAR_LENGTH:
-                consolidated_context = consolidated_context[:MAX_CONTEXT_CHAR_LENGTH]
-            report_prompt = f'Synthesize the provided context into a comprehensive, well-structured report on "{query}". Use markdown. Context:\n{consolidated_context}'
             report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": report_prompt}], "stream": True}
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
@@ -185,12 +159,19 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                     line_str = line.decode('utf-8').strip()
                     if line_str.startswith('data:'): line_str = line_str[5:].strip()
                     if line_str == "[DONE]": break
                     try:
                         chunk = json.loads(line_str)
-                        content = chunk.get("choices", [{}])[0].get("delta", {}).get("content")
-                        if content: yield format_sse({"event": "chunk", "data": content})
-                    except json.JSONDecodeError: continue
             yield format_sse({"event": "sources", "data": all_sources_used})
     except Exception as e:
         logger.error(f"A critical error occurred: {e}", exc_info=True)

 # --- Constants & Headers ---
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "gpt-4.1-mini"
+MAX_SOURCES_TO_PROCESS = 15 # Increase research depth for longer reports
+# Real Browser User Agents for Rotation
 USER_AGENTS = [
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
 app = FastAPI(
     title="AI Deep Research API",
+    description="Provides robust, long-form, streaming deep research completions.",
+    version="6.0.0"  # Final Production Version
 )
 # --- Core Service Functions ---
 async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
     search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
         async with session.get(search_url, headers=headers, timeout=15) as response:
+            response.raise_for_status(); html = await response.text()
+            soup = BeautifulSoup(html, "html.parser"); results = []
+            for res in soup.find_all('div', class_='result'):
+                title_tag, snippet_tag = res.find('a', class_='result__a'), res.find('a', class_='result__snippet')
+                if title_tag and snippet_tag and 'href' in title_tag.attrs:
+                    cleaned_link = re.sub(r'/l/\?kh=-1&uddg=', '', title_tag['href'])
+                    results.append({'title': title_tag.text, 'link': cleaned_link, 'snippet': snippet_tag.text})
             logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
             return results
     except Exception as e:
         logger.error(f"DuckDuckGo search failed for query '{query}': {e}"); return []
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
+        if source['link'].lower().endswith('.pdf'): raise ValueError("PDF content")
         async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
+            if response.status != 200: raise ValueError(f"HTTP status {response.status}")
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
             for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']): tag.decompose()
             content = " ".join(soup.stripped_strings)
+            if not content.strip(): raise ValueError("Parsed content is empty.")
             return content, source
     except Exception as e:
         logger.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
         return source.get('snippet', ''), source
             yield format_sse({"event": "plan", "data": sub_questions})
+            # Step 2: Conduct Deep Research
             yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
             search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
+            unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
             if not unique_sources:
+                yield format_sse({"event": "error", "data": "All search queries returned zero usable sources."}); return
+            # Limit the number of sources to process for very long reports
+            sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
+            yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})
+            processing_tasks = [research_and_process_source(session, source) for source in sources_to_process]
             consolidated_context, all_sources_used = "", []
             for task in asyncio.as_completed(processing_tasks):
                 content, source_info = await task
                 if content:
                     consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
                     all_sources_used.append(source_info)
             if not consolidated_context.strip():
+                yield format_sse({"event": "error", "data": "Failed to gather any research context."}); return
+            # Step 3: Synthesize Long-Form Final Report
             yield format_sse({"event": "status", "data": "Synthesizing final report..."})
+            # ***** ENHANCED PROMPT FOR LONGEST POSSIBLE REPORT *****
+            report_prompt = f"""
+You are an expert research analyst. Your task is to synthesize the provided context into a long-form, comprehensive, multi-page report on the topic: "{query}".
+Follow these instructions carefully:
+1.  Write in a professional, academic tone.
+2.  Structure the report with a clear introduction, multiple detailed sections with sub-headings using Markdown, and a concluding summary.
+3.  Elaborate extensively on each point. Use multiple paragraphs for each section to explore the nuances of the topic.
+4.  Base your entire report *only* on the information provided in the context below. Do not use any external knowledge.
+5.  Aim for the most detailed and thorough report possible based on the given material.
+## Research Context ##
+{consolidated_context}
+"""
             report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": report_prompt}], "stream": True}
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                     line_str = line.decode('utf-8').strip()
                     if line_str.startswith('data:'): line_str = line_str[5:].strip()
                     if line_str == "[DONE]": break
+                    # ***** FIX FOR 'list index out of range' ERROR *****
                     try:
                         chunk = json.loads(line_str)
+                        choices = chunk.get("choices")
+                        if choices and isinstance(choices, list) and len(choices) > 0:
+                            content = choices[0].get("delta", {}).get("content")
+                            if content:
+                                yield format_sse({"event": "chunk", "data": content})
+                    except json.JSONDecodeError:
+                        continue # Ignore malformed lines
+            # Final event with all source data
             yield format_sse({"event": "sources", "data": all_sources_used})
     except Exception as e:
         logger.error(f"A critical error occurred: {e}", exc_info=True)