Spaces:

rkihacker
/

Scrap

Sleeping

App Files Files Community

rkihacker committed on Sep 17

Commit

6c6c904

verified ·

1 Parent(s): 277b708

Update main.py

Browse files

Files changed (1) hide show

main.py +515 -89

main.py CHANGED Viewed

@@ -4,23 +4,27 @@ import json
 import logging
 import random
 import re
-from typing import AsyncGenerator, Optional, Tuple, List
-from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from dotenv import load_dotenv
 import aiohttp
 from bs4 import BeautifulSoup
 # --- Configuration ---
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = aiohttp.log.access_logger # Use aiohttp's logger for better async context
 load_dotenv()
-LLM_API_KEY = os.getenv("LLM_API_KEY")
 if not LLM_API_KEY:
     raise RuntimeError("LLM_API_KEY must be set in a .env file.")
 else:
@@ -29,141 +33,563 @@ else:
 # --- Constants & Headers ---
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-MAX_SOURCES_TO_PROCESS = 15
-# Real Browser User Agents for SCRAPING
-USER_AGENTS = [
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0"
-]
-LLM_HEADERS = {"Authorization": f"Bearer {LLM_API_KEY}", "Content-Type": "application/json", "Accept": "application/json"}
 class DeepResearchRequest(BaseModel):
     query: str
 app = FastAPI(
     title="AI Deep Research API",
-    description="Provides robust, long-form, streaming deep research completions using a simulated search.",
-    version="10.0.0"  # Final: Using simulated search to bypass external blocking.
 )
-app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
 def extract_json_from_llm_response(text: str) -> Optional[list]:
     match = re.search(r'\[.*\]', text, re.DOTALL)
     if match:
-        try: return json.loads(match.group(0))
-        except json.JSONDecodeError: return None
     return None
-async def call_duckduckgo_search(query: str, max_results: int = 10) -> List[dict]:
     """
-    Simulates a successful DuckDuckGo search to bypass anti-scraping measures.
-    This function returns a static, hardcoded list of relevant search results
-    for the topic "Nian" (Chinese New Year beast), allowing the rest of the
-    application pipeline to be tested.
     """
-    logging.info(f"Simulating search for: '{query}'")
-    # Static results related to "Nian" myth, as "niansuh" yields no results.
-    # This provides the scraper with valid URLs to process.
-    simulated_results = [
-        {'title': 'Nian - Wikipedia', 'link': 'https://en.wikipedia.org/wiki/Nian', 'snippet': 'The Nian is a beast from Chinese mythology. The Nian is said to have the body of a bull, the head of a lion with a single horn, and sharp teeth.'},
-        {'title': 'The Legend of Nian and the Origins of Chinese New Year', 'link': 'https://www.chinahighlights.com/travelguide/festivals/story-of-nian.htm', 'snippet': 'Learn about the monster Nian and how the traditions of wearing red, setting off firecrackers, and staying up late came to be part of Chinese New Year.'},
-        {'title': 'Nian: The Beast That Invented Chinese New Year - Culture Trip', 'link': 'https://theculturetrip.com/asia/china/articles/nian-the-beast-that-invented-chinese-new-year', 'snippet': 'Once a year, at the beginning of Chinese New Year, a beast named Nian would terrorize a small village in China, eating their crops, livestock, and children.'},
-        {'title': 'Chinese New Year mythology: The story of Nian - British Museum', 'link': 'https://www.britishmuseum.org/blog/chinese-new-year-mythology-story-nian', 'snippet': 'Discover the mythical origins of the Chinese New Year celebration and the fearsome beast, Nian.'},
-        {'title': 'Year of the Nian Monster - Asian Art Museum', 'link': 'https://education.asianart.org/resources/year-of-the-nian-monster/', 'snippet': 'A summary of the story of the Nian monster for educators and children, explaining the connection to modern traditions.'}
-    ]
-    logging.info(f"Returning {len(simulated_results)} static sources.")
-    return simulated_results[:max_results]
-async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
-    headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
-        logging.info(f"Scraping: {source['link']}")
-        if source['link'].lower().endswith('.pdf'): raise ValueError("PDF content")
-        async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
-            if response.status != 200: raise ValueError(f"HTTP status {response.status}")
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
-            for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']): tag.decompose()
-            content = " ".join(soup.stripped_strings)
-            if not content.strip(): raise ValueError("Parsed content is empty.")
-            return content, source
     except Exception as e:
-        logging.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
-        return source.get('snippet', ''), source
 async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
-    def format_sse(data: dict) -> str: return f"data: {json.dumps(data)}\n\n"
     try:
-        async with aiohttp.ClientSession() as session:
-            yield format_sse({"event": "status", "data": "Generating research plan..."})
-            plan_prompt = {"model": LLM_MODEL, "messages": [{"role": "user", "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array. Example: [\"Question 1?\"]"}]}
-            try:
-                async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=plan_prompt, timeout=25) as response:
-                    response.raise_for_status(); result = await response.json()
-                    sub_questions = result if isinstance(result, list) else extract_json_from_llm_response(result['choices'][0]['message']['content'])
-                    if not isinstance(sub_questions, list) or not sub_questions: raise ValueError(f"Invalid plan from LLM: {result}")
-            except Exception as e:
-                yield format_sse({"event": "error", "data": f"Could not generate research plan. Reason: {e}"}); return
             yield format_sse({"event": "plan", "data": sub_questions})
-            yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
-            search_tasks = [call_duckduckgo_search(sq) for sq in sub_questions]
-            all_search_results = await asyncio.gather(*search_tasks)
-            unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
             if not unique_sources:
-                yield format_sse({"event": "error", "data": "The simulated search returned no sources. Check the hardcoded list."}); return
-            sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
-            yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})
-            processing_tasks = [research_and_process_source(session, source) for source in sources_to_process]
-            consolidated_context, all_sources_used = "", []
-            for task in asyncio.as_completed(processing_tasks):
-                content, source_info = await task
                 if content and content.strip():
                     consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
                     all_sources_used.append(source_info)
             if not consolidated_context.strip():
-                yield format_sse({"event": "error", "data": "Failed to scrape content from any of the discovered sources."}); return
-            yield format_sse({"event": "status", "data": "Synthesizing final report..."})
-            report_prompt = f'Synthesize the provided context into a long-form, comprehensive, multi-page report on "{query}". Use markdown. Elaborate extensively on each point. Base your entire report ONLY on the provided context.\n\n## Research Context ##\n{consolidated_context}'
-            report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": report_prompt}], "stream": True}
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                 response.raise_for_status()
                 async for line in response.content:
                     line_str = line.decode('utf-8').strip()
-                    if line_str.startswith('data:'): line_str = line_str[5:].strip()
-                    if line_str == "[DONE]": break
                     try:
                         chunk = json.loads(line_str)
                         choices = chunk.get("choices")
                         if choices and isinstance(choices, list) and len(choices) > 0:
                             content = choices[0].get("delta", {}).get("content")
-                            if content: yield format_sse({"event": "chunk", "data": content})
-                    except json.JSONDecodeError: continue
             yield format_sse({"event": "sources", "data": all_sources_used})
     except Exception as e:
-        logging.error(f"A critical error occurred: {e}", exc_info=True)
-        yield format_sse({"event": "error", "data": f"An unexpected error occurred: {str(e)}"})
 @app.post("/deep-research", response_class=StreamingResponse)
 async def deep_research_endpoint(request: DeepResearchRequest):
-    return StreamingResponse(run_deep_research_stream(request.query), media_type="text/event-stream")
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 import logging
 import random
 import re
+import time
+from typing import AsyncGenerator, Optional, Tuple, List, Dict
+from urllib.parse import quote_plus
+from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from dotenv import load_dotenv
 import aiohttp
 from bs4 import BeautifulSoup
+from fake_useragent import UserAgent
 # --- Configuration ---
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
 load_dotenv()
+LLM_API_KEY = os.getenv("LLM_API_KEY")
 if not LLM_API_KEY:
     raise RuntimeError("LLM_API_KEY must be set in a .env file.")
 else:
 # --- Constants & Headers ---
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+MAX_SOURCES_TO_PROCESS = 6  # Reduced to stay within time limits with real requests
+MAX_CONCURRENT_REQUESTS = 3  # Be conservative with real websites
+RESEARCH_TIMEOUT = 180  # 3 minutes maximum
+REQUEST_DELAY = 2.0  # Longer delay between requests to be more polite
+USER_AGENT_ROTATION = True
# Initialize the user-agent generator used for outgoing scraping requests.
try:
    ua = UserAgent()
except Exception:
    # UserAgent() could not be constructed, so fall back to a small local pool.
    # (A missing fake_useragent package would already have failed at the
    # import statement, so this only covers initialisation errors.)
    # Catching Exception instead of using a bare `except:` lets
    # KeyboardInterrupt and SystemExit propagate during startup.
    class SimpleUA:
        """Minimal stand-in whose ``random()`` returns a fixed browser UA string."""

        _AGENTS = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0",
        )

        # NOTE(review): fake_useragent exposes `random` as a property, whereas
        # this fallback exposes it as a method - confirm callers handle both.
        def random(self):
            """Return one User-Agent string chosen at random from the pool."""
            return random.choice(self._AGENTS)

    ua = SimpleUA()
+LLM_HEADERS = {
+    "Authorization": f"Bearer {LLM_API_KEY}",
+    "Content-Type": "application/json",
+    "Accept": "application/json"
+}
 class DeepResearchRequest(BaseModel):
     query: str
 app = FastAPI(
     title="AI Deep Research API",
+    description="Provides robust, long-form, streaming deep research completions using real web searches.",
+    version="2.1.0"  # Updated version
+)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"]
 )
 def extract_json_from_llm_response(text: str) -> Optional[list]:
     """Return the first well-formed JSON array embedded in ``text``.

     LLM replies often wrap the requested array in prose that may itself
     contain square brackets. Instead of grabbing everything between the first
     ``[`` and the last ``]`` (which fails as soon as a stray bracket appears
     anywhere else), each ``[`` is tried in turn as the start of a JSON value
     and the first one that decodes to a list wins.

     Args:
         text: Raw text returned by the model.

     Returns:
         The decoded list, or ``None`` when no JSON array can be found.
     """
     decoder = json.JSONDecoder()
     start = text.find('[')
     while start != -1:
         try:
             # raw_decode parses one JSON value starting at `start` and
             # ignores whatever trails it.
             value, _ = decoder.raw_decode(text, start)
         except json.JSONDecodeError:
             value = None
         if isinstance(value, list):
             return value
         start = text.find('[', start + 1)
     return None
async def get_real_user_agent() -> str:
    """Return a User-Agent header value for outgoing requests.

    When ``USER_AGENT_ROTATION`` is enabled the value comes from the
    module-level ``ua`` generator; otherwise a fixed Chrome string is used.

    Returns:
        A browser-like User-Agent string.
    """
    if USER_AGENT_ROTATION:
        candidate = ua.random
        # fake_useragent's UserAgent exposes `random` as a property (already a
        # string), while the local fallback class defines it as a method.
        # Calling the property result would raise TypeError, so handle both.
        return candidate() if callable(candidate) else candidate
    return "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
async def check_robots_txt(url: str) -> bool:
    """Report whether robots.txt permits fetching ``url``.

    The site's robots.txt is downloaded and evaluated with the standard
    library parser against the wildcard (``*``) user agent. A plain substring
    test such as ``"Disallow: /" in text`` is not sufficient: it also matches
    every path-specific rule (``Disallow: /admin``) and ignores user-agent
    groups and ``Allow`` lines.

    Args:
        url: Absolute http(s) URL that is about to be scraped.

    Returns:
        ``True`` when fetching is allowed or the site serves no robots.txt
        (non-200 response); ``False`` when the URL is not a usable http(s)
        URL, when the rules disallow it, or when robots.txt could not be
        retrieved.
    """
    from urllib.parse import urlsplit
    from urllib.robotparser import RobotFileParser

    parts = urlsplit(url)
    if parts.scheme not in ("http", "https") or not parts.netloc:
        return False
    # robots.txt lives under the same scheme and authority as the page itself.
    robots_url = f"{parts.scheme}://{parts.netloc}/robots.txt"
    try:
        async with aiohttp.ClientSession() as session:
            headers = {'User-Agent': await get_real_user_agent()}
            async with session.get(robots_url, headers=headers, timeout=5) as response:
                if response.status != 200:
                    # No readable robots.txt: treated as "no restrictions".
                    return True
                robots = await response.text()
    except Exception as e:
        logging.warning(f"Could not check robots.txt for {url}: {e}")
        return False  # Default to not scraping if we can't check
    parser = RobotFileParser()
    parser.parse(robots.splitlines())
    return parser.can_fetch("*", url)
def _resolve_ddg_link(href: str) -> Tuple[str, bool]:
    """Resolve a DuckDuckGo result href to ``(url, needs_redirect)``.

    ``needs_redirect`` is True only when the href is a DuckDuckGo ``/l/``
    redirect wrapper whose destination could not be read from its query
    string, in which case ``url`` is the absolute wrapper URL.
    """
    from urllib.parse import parse_qs, urljoin, urlsplit

    if href.startswith(("http://", "https://")):
        absolute = href
    else:
        # Handles both relative ("/l/?...") and protocol-relative
        # ("//duckduckgo.com/l/?...") hrefs.
        absolute = urljoin("https://duckduckgo.com/", href)
    parts = urlsplit(absolute)
    host = parts.hostname or ""
    is_ddg = host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
    if not (is_ddg and parts.path.startswith("/l/")):
        return absolute, False
    # NOTE(review): the wrapper is expected to carry the destination in the
    # `uddg` query parameter - confirm against the current result markup.
    target = parse_qs(parts.query).get("uddg")
    if target:
        return target[0], False
    return absolute, True


async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
    """Search DuckDuckGo's HTML interface and return parsed results.

    Note: This may break if DuckDuckGo changes their HTML structure.

    Args:
        query: Free-text search query.
        max_results: Maximum number of result blocks to inspect.

    Returns:
        A list of dicts with ``title``, ``link`` and ``snippet`` keys. The list
        is empty when the search request fails or nothing can be parsed.
    """
    try:
        search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
        headers = {
            "User-Agent": await get_real_user_agent(),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://duckduckgo.com/",
            "DNT": "1"
        }
        async with aiohttp.ClientSession() as session:
            async with session.get(search_url, headers=headers, timeout=10) as response:
                if response.status != 200:
                    logging.warning(f"Search failed with status {response.status}")
                    return []
                html = await response.text()
            soup = BeautifulSoup(html, 'html.parser')
            results = []
            for result in soup.select('.result')[:max_results]:
                try:
                    title_elem = result.select_one('.result__title .result__a')
                    snippet_elem = result.select_one('.result__snippet')
                    if title_elem is None or snippet_elem is None:
                        continue
                    # Decode the destination locally; only fall back to an
                    # HTTP round trip when the wrapper carries no target.
                    link, needs_redirect = _resolve_ddg_link(title_elem['href'])
                    if needs_redirect:
                        try:
                            async with session.get(link, headers=headers, timeout=5, allow_redirects=False) as redirect_resp:
                                if redirect_resp.status == 302:
                                    link = redirect_resp.headers.get('Location', link)
                        except Exception as e:
                            logging.warning(f"Could not follow redirect for {link}: {e}")
                            continue
                    results.append({
                        'title': title_elem.get_text(strip=True),
                        'link': link,
                        'snippet': snippet_elem.get_text(strip=True)
                    })
                except Exception as e:
                    logging.warning(f"Error parsing search result: {e}")
                    continue
            logging.info(f"Found {len(results)} real search results for '{query}'")
            return results
    except Exception as e:
        logging.error(f"Real search failed: {e}")
        return []
def _extract_main_text(html: str) -> str:
    """Return whitespace-normalised readable text from an HTML document."""
    soup = BeautifulSoup(html, "html.parser")
    # Remove unwanted elements
    for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe', 'noscript', 'form']):
        tag.decompose()
    # Try to find main content by common patterns
    main_content = None
    for selector in ('main', 'article', '[role="main"]', '.main-content', '.content',
                     '.article-body', '.post-content', '.entry-content', '#content'):
        main_content = soup.select_one(selector)
        if main_content is not None:
            break
    if main_content is None:
        # Ranking every element by text length always selects the outermost
        # element of a normal document, so use the whole tree directly rather
        # than calling get_text() on each node.
        main_content = soup
    content = re.sub(r'\s+', ' ', " ".join(main_content.stripped_strings)).strip()
    # If content is too short for a large page, try alternative extraction
    if len(content.split()) < 50 and len(html) > 10000:
        paragraphs = (p.get_text() for p in soup.find_all('p'))
        content = re.sub(r'\s+', ' ', " ".join(t for t in paragraphs if t.strip())).strip()
        if len(content.split()) < 50:
            content = re.sub(r'\s+', ' ', " ".join(soup.stripped_strings)).strip()
    return content


async def process_web_source(session: "aiohttp.ClientSession", source: dict, timeout: int = 15) -> Tuple[str, dict]:
    """Fetch one search result and extract its readable text.

    Any failure (disallowed by robots.txt, non-HTML content, HTTP error,
    timeout, too little text) falls back to the search snippet, so this
    returns normally for any ``source`` that has a ``'link'`` key.

    Args:
        session: Shared aiohttp client session used for the page request.
        source: Search result dict; must contain ``'link'`` and may contain
            ``'snippet'``.
        timeout: Timeout in seconds passed to the page request.

    Returns:
        ``(content, source_info)`` where ``source_info`` is a copy of
        ``source``. On success it also carries ``word_count`` and
        ``processing_time``.
    """
    source_info = source.copy()
    snippet = source.get('snippet', '')
    link = source['link']

    # Cheap local check first: no point spending a robots.txt request on a
    # document type that will be skipped anyway.
    if link.lower().endswith(('.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx')):
        logging.info(f"Skipping non-HTML content at {link}")
        return snippet, source_info

    if not await check_robots_txt(link):
        logging.info(f"Scraping disallowed by robots.txt for {link}")
        return snippet, source_info

    try:
        logging.info(f"Processing source: {link}")
        start_time = time.time()
        headers = {'User-Agent': await get_real_user_agent()}
        # Add delay between requests to be polite
        await asyncio.sleep(REQUEST_DELAY)
        # NOTE(review): ssl=False disables TLS certificate verification;
        # confirm this is intentional before relying on scraped content.
        async with session.get(link, headers=headers, timeout=timeout, ssl=False) as response:
            if response.status != 200:
                logging.warning(f"HTTP {response.status} for {link}")
                return snippet, source_info
            content_type = response.headers.get('Content-Type', '').lower()
            if 'text/html' not in content_type:
                logging.info(f"Non-HTML content at {link} (type: {content_type})")
                return snippet, source_info
            html = await response.text()
        content = _extract_main_text(html)
        word_count = len(content.split())
        if word_count < 30:  # Minimum threshold for useful content
            logging.warning(f"Very little content extracted from {link}")
            return snippet, source_info
        source_info['word_count'] = word_count
        source_info['processing_time'] = time.time() - start_time
        return content, source_info
    except asyncio.TimeoutError:
        logging.warning(f"Timeout while processing {link}")
        return snippet, source_info
    except Exception as e:
        logging.warning(f"Error processing {link}: {str(e)[:200]}")
        return snippet, source_info
def _clean_sub_questions(raw) -> List[str]:
    """Keep non-empty string questions, trimmed of surrounding punctuation (max 5)."""
    if not isinstance(raw, list):
        return []
    cleaned = []
    for q in raw:
        if isinstance(q, str) and q.strip():
            # [\W_] is Unicode-aware, so leading/trailing accented or non-Latin
            # letters are kept while quotes, bullets, etc. are still stripped.
            cleaned_q = re.sub(r'^[\W_]+|[\W_]+$', '', q)
            if cleaned_q:
                cleaned.append(cleaned_q)
    return cleaned[:5]  # Limit to 5 questions max


async def generate_research_plan(query: str, session: "aiohttp.ClientSession") -> List[str]:
    """Ask the LLM for sub-questions that break ``query`` into research topics.

    Args:
        query: The user's research topic.
        session: aiohttp client session used for the LLM request.

    Returns:
        Up to five cleaned sub-questions from the LLM. When the reply holds no
        usable question, or the request fails, generic template questions built
        from ``query`` are returned instead, so the result is never empty.
    """
    try:
        plan_prompt = {
            "model": LLM_MODEL,
            "messages": [{
                "role": "user",
                "content": f"""Generate 4-5 focused sub-questions for in-depth research on '{query}'.
                The questions should cover different aspects and perspectives of the topic.
                Ensure the questions are specific enough to guide web searches effectively.
                Your response MUST be ONLY the raw JSON array with no additional text.
                Example: ["What is the historical background of X?", "What are the current trends in X?"]"""
            }],
            "temperature": 0.7,
            "max_tokens": 300
        }
        async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=plan_prompt, timeout=30) as response:
            response.raise_for_status()
            result = await response.json()
        if isinstance(result, list):
            raw_questions = result
        elif isinstance(result, dict) and 'choices' in result:
            raw_questions = extract_json_from_llm_response(result['choices'][0]['message']['content'])
        else:
            raw_questions = None
        questions = _clean_sub_questions(raw_questions)
        if questions:
            return questions
        # Fallback if we couldn't get good questions from LLM (this also covers
        # a reply whose entries were all empty or non-string).
        default_questions = [
            f"What is {query} and its key characteristics?",
            f"What are the main aspects or components of {query}?",
            f"What is the history and development of {query}?",
            f"What are the current trends or recent developments in {query}?",
            f"What are common challenges or controversies related to {query}?"
        ]
        return default_questions[:4]
    except Exception as e:
        logging.error(f"Failed to generate research plan: {e}")
        return [
            f"What is {query}?",
            f"What are the key features of {query}?",
            f"What is the history of {query}?",
            f"What are current developments in {query}?"
        ]
 async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
+    def format_sse(data: dict) -> str:
+        return f"data: {json.dumps(data)}\n\n"
+    start_time = time.time()
+    processed_sources = 0
+    successful_sources = 0
+    total_tokens = 0
     try:
+        # Initialize the SSE stream with start message
+        yield format_sse({
+            "event": "status",
+            "data": f"Starting deep research on '{query}'. Target completion time: 2-3 minutes."
+        })
+        async with aiohttp.ClientSession() as session:
+            # Step 1: Generate research plan
+            yield format_sse({"event": "status", "data": "Generating comprehensive research plan..."})
+            sub_questions = await generate_research_plan(query, session)
             yield format_sse({"event": "plan", "data": sub_questions})
+            # Step 2: Search for sources for each sub-question
+            yield format_sse({
+                "event": "status",
+                "data": f"Searching for sources across {len(sub_questions)} research topics..."
+            })
+            all_search_results = []
+            for sub_question in sub_questions:
+                try:
+                    # Add delay between searches to be polite
+                    if len(all_search_results) > 0:
+                        await asyncio.sleep(REQUEST_DELAY)
+                    results = await fetch_search_results(sub_question, max_results=3)
+                    if results:
+                        all_search_results.extend(results)
+                        yield format_sse({
+                            "event": "status",
+                            "data": f"Found {len(results)} sources for question: '{sub_question[:60]}...'"
+                        })
+                    else:
+                        yield format_sse({
+                            "event": "warning",
+                            "data": f"No search results found for: '{sub_question[:60]}...'"
+                        })
+                except Exception as e:
+                    logging.error(f"Search failed for '{sub_question}': {e}")
+                    yield format_sse({
+                        "event": "warning",
+                        "data": f"Search failed for one sub-topic: {str(e)[:100]}"
+                    })
+            if not all_search_results:
+                yield format_sse({
+                    "event": "error",
+                    "data": "No search results found. Check your query and try again."
+                })
+                return
+            # Deduplicate results by URL
+            unique_sources = []
+            seen_urls = set()
+            for result in all_search_results:
+                if result['link'] not in seen_urls:
+                    seen_urls.add(result['link'])
+                    unique_sources.append(result)
+            # Limit to max sources we want to process
+            unique_sources = unique_sources[:MAX_SOURCES_TO_PROCESS]
+            yield format_sse({
+                "event": "status",
+                "data": f"Found {len(unique_sources)} unique sources to process."
+            })
+            # If we have no sources, return early
             if not unique_sources:
+                yield format_sse({
+                    "event": "error",
+                    "data": "No valid sources found after deduplication."
+                })
+                return
+            # Step 3: Process sources with concurrency control
+            semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
+            consolidated_context = ""
+            all_sources_used = []
+            processing_errors = 0
+            async def process_with_semaphore(source):
+                async with semaphore:
+                    return await process_web_source(session, source, timeout=20)
+            # Process sources with progress updates
+            processing_tasks = []
+            for i, source in enumerate(unique_sources):
+                # Check if we're running out of time
+                elapsed = time.time() - start_time
+                if elapsed > RESEARCH_TIMEOUT * 0.7:  # Leave 30% of time for synthesis
+                    yield format_sse({
+                        "event": "status",
+                        "data": f"Approaching time limit, stopping source processing at {i}/{len(unique_sources)}"
+                    })
+                    break
+                # Add delay between processing each source to be polite
+                if i > 0:
+                    await asyncio.sleep(REQUEST_DELAY * 0.5)  # Shorter delay between same-domain requests
+                task = asyncio.create_task(process_with_semaphore(source))
+                processing_tasks.append(task)
+                # Yield progress updates periodically
+                if (i + 1) % 2 == 0 or (i + 1) == len(unique_sources):
+                    yield format_sse({
+                        "event": "status",
+                        "data": f"Processed {min(i+1, len(unique_sources))}/{len(unique_sources)} sources..."
+                    })
+            # Process completed tasks as they finish
+            for future in asyncio.as_completed(processing_tasks):
+                processed_sources += 1
+                content, source_info = await future
                 if content and content.strip():
+                    # Add source content to our consolidated context
                     consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
                     all_sources_used.append(source_info)
+                    successful_sources += 1
+                    total_tokens += len(content.split())  # Rough token count
+                else:
+                    processing_errors += 1
             if not consolidated_context.strip():
+                yield format_sse({
+                    "event": "error",
+                    "data": f"Failed to extract content from any sources. {processing_errors} errors occurred."
+                })
+                return
+            # Step 4: Synthesize report with improved prompt
+            time_remaining = max(0, RESEARCH_TIMEOUT - (time.time() - start_time))
+            yield format_sse({
+                "event": "status",
+                "data": f"Synthesizing report with content from {successful_sources} sources..."
+            })
+            # Estimate how many tokens we can generate based on remaining time
+            max_output_tokens = min(1500, int(time_remaining * 5))
+            report_prompt = f"""Compose a comprehensive research report on "{query}".
+            Structure the report with clear sections based on the research questions.
+            Use markdown formatting for headings, lists, and emphasis.
+            Key requirements:
+            1. Start with an introduction that explains what {query} is and why it's important
+            2. Include well-organized sections with clear headings based on the research questions
+            3. Cite specific information from sources where appropriate
+            4. End with a conclusion that summarizes key findings and insights
+            5. Keep the report concise but comprehensive
+            Available information (summarized from {successful_sources} sources):
+            {consolidated_context[:18000]}  # Increased context size but still limited
+            Generate a report that is approximately {max_output_tokens//4} words long (about {max_output_tokens//4//200} paragraphs).
+            Focus on the most important and relevant information.
+            """
+            report_payload = {
+                "model": LLM_MODEL,
+                "messages": [{"role": "user", "content": report_prompt}],
+                "stream": True,
+                "max_tokens": max_output_tokens
+            }
+            # Stream the report generation
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                 response.raise_for_status()
                 async for line in response.content:
+                    # Check if we're running out of time
+                    if time.time() - start_time > RESEARCH_TIMEOUT:
+                        yield format_sse({
+                            "event": "warning",
+                            "data": "Time limit reached, ending report generation early."
+                        })
+                        break
                     line_str = line.decode('utf-8').strip()
+                    if line_str.startswith('data:'):
+                        line_str = line_str[5:].strip()
+                    if line_str == "[DONE]":
+                        break
                     try:
                         chunk = json.loads(line_str)
                         choices = chunk.get("choices")
                         if choices and isinstance(choices, list) and len(choices) > 0:
                             content = choices[0].get("delta", {}).get("content")
+                            if content:
+                                yield format_sse({"event": "chunk", "data": content})
+                    except json.JSONDecodeError:
+                        continue
+                    except Exception as e:
+                        logging.warning(f"Error processing stream chunk: {e}")
+                        continue
+            # Final status update
+            duration = time.time() - start_time
+            stats = {
+                "total_time_seconds": round(duration),
+                "sources_processed": processed_sources,
+                "sources_successful": successful_sources,
+                "estimated_tokens": total_tokens,
+                "sources_used": len(all_sources_used)
+            }
+            yield format_sse({
+                "event": "status",
+                "data": f"Research completed successfully in {duration:.1f} seconds."
+            })
+            yield format_sse({"event": "stats", "data": stats})
             yield format_sse({"event": "sources", "data": all_sources_used})
+    except asyncio.TimeoutError:
+        yield format_sse({
+            "event": "error",
+            "data": f"Research process timed out after {RESEARCH_TIMEOUT} seconds."
+        })
     except Exception as e:
+        logging.error(f"Critical error in research process: {e}", exc_info=True)
+        yield format_sse({
+            "event": "error",
+            "data": f"An unexpected error occurred: {str(e)[:200]}"
+        })
+    finally:
+        duration = time.time() - start_time
+        yield format_sse({
+            "event": "complete",
+            "data": f"Research process finished after {duration:.1f} seconds."
+        })
 @app.post("/deep-research", response_class=StreamingResponse)
 async def deep_research_endpoint(request: DeepResearchRequest):
+    """Endpoint for deep research that streams SSE responses."""
     # Reject an empty query, or one shorter than 3 characters once surrounding
     # whitespace is stripped, with an HTTP 400 before any streaming starts.
+    if not request.query or len(request.query.strip()) < 3:
+        raise HTTPException(status_code=400, detail="Query must be at least 3 characters long")
     # Pass the stripped query to run_deep_research_stream and wrap what it
     # returns in a StreamingResponse served as text/event-stream.
+    return StreamingResponse(
+        run_deep_research_stream(request.query.strip()),
+        media_type="text/event-stream"
+    )
 if __name__ == "__main__":
     # uvicorn is imported only on this script path, not when the module is imported.
     import uvicorn
     # Serve `app` on host 0.0.0.0 (all IPv4 interfaces), port 8000.
+    uvicorn.run(app, host="0.0.0.0", port=8000)