Update main.py
main.py
CHANGED
@@ -6,7 +6,7 @@ import random
 import re
 import time
 from typing import AsyncGenerator, Optional, Tuple, List, Dict
-from urllib.parse import quote_plus
+from urllib.parse import quote_plus, unquote_plus, urlparse
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
@@ -15,6 +15,7 @@ from dotenv import load_dotenv
 import aiohttp
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
+from collections import defaultdict

 # --- Configuration ---
 logging.basicConfig(
@@ -33,16 +34,17 @@ else:
 # --- Constants & Headers ---
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-MAX_SOURCES_TO_PROCESS =
-MAX_CONCURRENT_REQUESTS =
-
-
+MAX_SOURCES_TO_PROCESS = 10   # Increased to get more comprehensive results
+MAX_CONCURRENT_REQUESTS = 5   # Increased for faster processing
+SEARCH_TIMEOUT = 120          # 2 minutes for searching (adjustable)
+TOTAL_TIMEOUT = 180           # 3 minutes total
+REQUEST_DELAY = 1.0           # Shorter delay between requests
+USER_AGENT_ROTATION = True

 # Initialize fake user agent generator
 try:
     ua = UserAgent()
 except:
-    # Fallback if fake_useragent isn't available
     class SimpleUA:
         def random(self):
             return random.choice([
@@ -60,11 +62,12 @@ LLM_HEADERS = {

 class DeepResearchRequest(BaseModel):
     query: str
+    search_time: int = 120  # Default to 2 minutes

 app = FastAPI(
     title="AI Deep Research API",
-    description="Provides
-    version="
+    description="Provides comprehensive research reports from real web searches within 1-2 minutes.",
+    version="3.0.0"
 )
 app.add_middleware(
     CORSMiddleware,
@@ -88,11 +91,46 @@ async def get_real_user_agent() -> str:
     """Get a realistic user agent string."""
     try:
         if isinstance(ua, UserAgent):
-            return ua.random
+            return ua.random  # .random is a property on fake_useragent's UserAgent
         return ua.random()  # For our fallback class
     except:
         return "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"

+def clean_url(url: str) -> str:
+    """Clean up and normalize URLs."""
+    if not url:
+        return ""
+
+    # Handle DuckDuckGo redirect URLs
+    if url.startswith('//duckduckgo.com/l/'):
+        url = f"https:{url}"  # Make it a proper URL
+        try:
+            # Extract the real URL from DuckDuckGo's redirect
+            parsed = urlparse(url)
+            query_params = parsed.query
+            if 'uddg=' in query_params:
+                # Extract the actual URL from the parameter
+                match = re.search(r'uddg=([^&]+)', query_params)
+                if match:
+                    encoded_url = match.group(1)
+                    try:
+                        # Decode the percent-encoded target URL and return it
+                        return unquote_plus(encoded_url)
+                    except:
+                        pass
+        except:
+            pass
+
+    # Ensure URL has proper scheme
+    if url.startswith('//'):
+        url = 'https:' + url
+    elif not url.startswith(('http://', 'https://')):
+        url = 'https://' + url
+
+    return url
+
 async def check_robots_txt(url: str) -> bool:
     """Check if scraping is allowed by robots.txt."""
     try:
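As an illustration of what clean_url is meant to do with DuckDuckGo's redirect links, here is a minimal sketch; the href below is a made-up example, not taken from the commit:

```python
from urllib.parse import unquote_plus
import re

# Hypothetical DuckDuckGo result href as returned by the HTML interface
href = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&rut=abc123"

match = re.search(r'uddg=([^&]+)', href)
if match:
    # Decoding the uddg parameter recovers the real target URL
    print(unquote_plus(match.group(1)))  # -> https://example.com/page
```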
@@ -121,7 +159,7 @@ async def check_robots_txt(url: str) -> bool:

 async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
     """
-    Perform a real search using DuckDuckGo's HTML interface.
+    Perform a real search using DuckDuckGo's HTML interface with improved URL handling.
     """
     try:
         search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
@@ -143,37 +181,41 @@ async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
         soup = BeautifulSoup(html, 'html.parser')

         results = []
-        #
-        for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Try multiple selectors as DuckDuckGo may change their HTML structure
+        for selector in ['.result__body', '.result__a', '.result']:
+            if len(results) >= max_results:
+                break
+
+            for result in soup.select(selector)[:max_results]:
+                try:
+                    title_elem = result.select_one('.result__title .result__a') or result.select_one('.result__a')
+                    if not title_elem:
+                        continue
+
+                    link = title_elem['href']
+                    snippet_elem = result.select_one('.result__snippet')
+
+                    # Clean the URL
+                    clean_link = clean_url(link)
+
+                    # Skip if we couldn't get a clean URL
+                    if not clean_link or clean_link.startswith('javascript:'):
+                        continue
+
+                    # Get snippet if available
+                    snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""

                     results.append({
                         'title': title_elem.get_text(strip=True),
-                        'link':
-                        'snippet':
+                        'link': clean_link,
+                        'snippet': snippet
                     })
-
-
-
+                except Exception as e:
+                    logging.warning(f"Error parsing search result: {e}")
+                    continue

         logging.info(f"Found {len(results)} real search results for '{query}'")
-        return results
+        return results[:max_results]
     except Exception as e:
         logging.error(f"Real search failed: {e}")
         return []
@@ -184,32 +226,37 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeo
     """
     headers = {'User-Agent': await get_real_user_agent()}
     source_info = source.copy()
+    source_info['link'] = clean_url(source['link'])  # Ensure URL is clean
+
+    # Skip if URL is invalid
+    if not source_info['link'] or not source_info['link'].startswith(('http://', 'https://')):
+        return source.get('snippet', ''), source_info

     # Check robots.txt first
-    if not await check_robots_txt(
-        logging.info(f"Scraping disallowed by robots.txt for {
+    if not await check_robots_txt(source_info['link']):
+        logging.info(f"Scraping disallowed by robots.txt for {source_info['link']}")
         return source.get('snippet', ''), source_info

     try:
-        logging.info(f"Processing source: {
+        logging.info(f"Processing source: {source_info['link']}")
         start_time = time.time()

         # Skip non-HTML content
-        if any(
-            logging.info(f"Skipping non-HTML content at {
+        if any(source_info['link'].lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx']):
+            logging.info(f"Skipping non-HTML content at {source_info['link']}")
             return source.get('snippet', ''), source_info

         # Add delay between requests to be polite
         await asyncio.sleep(REQUEST_DELAY)

-        async with session.get(
+        async with session.get(source_info['link'], headers=headers, timeout=timeout, ssl=False) as response:
             if response.status != 200:
-                logging.warning(f"HTTP {response.status} for {
+                logging.warning(f"HTTP {response.status} for {source_info['link']}")
                 return source.get('snippet', ''), source_info

             content_type = response.headers.get('Content-Type', '').lower()
             if 'text/html' not in content_type:
-                logging.info(f"Non-HTML content at {
+                logging.info(f"Non-HTML content at {source_info['link']} (type: {content_type})")
                 return source.get('snippet', ''), source_info

             html = await response.text()
@@ -229,7 +276,10 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeo
                 '.article-body',
                 '.post-content',
                 '.entry-content',
-                '#content'
+                '#content',
+                '#main',
+                '.main',
+                '.article'
             ]

             main_content = None
@@ -265,8 +315,21 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeo
             content = " ".join(soup.stripped_strings)
             content = re.sub(r'\s+', ' ', content).strip()

+            # If content is still too short, try to extract from specific tags
             if len(content.split()) < 30:
-
+                # Try to get content from divs with certain classes
+                for tag in ['div', 'section', 'article']:
+                    for element in soup.find_all(tag):
+                        if len(element.get_text().split()) > 200:  # If this element has substantial content
+                            content = " ".join(element.stripped_strings)
+                            content = re.sub(r'\s+', ' ', content).strip()
+                            if len(content.split()) >= 30:  # If we got enough content
+                                break
+                    if len(content.split()) >= 30:
+                        break
+
+            if len(content.split()) < 30:
+                logging.warning(f"Very little content extracted from {source_info['link']}")
                 return source.get('snippet', ''), source_info

             source_info['word_count'] = len(content.split())
@@ -274,10 +337,10 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeo
             return content, source_info

     except asyncio.TimeoutError:
-        logging.warning(f"Timeout while processing {
+        logging.warning(f"Timeout while processing {source_info['link']}")
         return source.get('snippet', ''), source_info
     except Exception as e:
-        logging.warning(f"Error processing {
+        logging.warning(f"Error processing {source_info['link']}: {str(e)[:200]}")
         return source.get('snippet', ''), source_info

 async def generate_research_plan(query: str, session: aiohttp.ClientSession) -> List[str]:
@@ -287,9 +350,8 @@ async def generate_research_plan(query: str, session: aiohttp.ClientSession) ->
         "model": LLM_MODEL,
         "messages": [{
             "role": "user",
-            "content": f"""Generate 4-
-
-            Ensure the questions are specific enough to guide web searches effectively.
+            "content": f"""Generate 4-6 comprehensive sub-questions for in-depth research on '{query}'.
+            Focus on key aspects that would provide a complete understanding of the topic.
             Your response MUST be ONLY the raw JSON array with no additional text.
             Example: ["What is the historical background of X?", "What are the current trends in X?"]"""
         }],
@@ -313,26 +375,154 @@ async def generate_research_plan(query: str, session: aiohttp.ClientSession) ->
                 cleaned_q = re.sub(r'^[^a-zA-Z0-9]*|[^a-zA-Z0-9]*$', '', q)
                 if cleaned_q:
                     cleaned.append(cleaned_q)
-            return cleaned[:
+            return cleaned[:6]  # Limit to 6 questions max

         # Fallback if we couldn't get good questions from LLM
         return [
-            f"What is {query} and its key
-            f"
-            f"What
-            f"What are the
-            f"What
+            f"What is {query} and its key features?",
+            f"How does {query} compare to alternatives?",
+            f"What are the current developments in {query}?",
+            f"What are the main challenges with {query}?",
+            f"What does the future hold for {query}?"
         ]
     except Exception as e:
         logging.error(f"Failed to generate research plan: {e}")
         return [
             f"What is {query}?",
-            f"What are the key
-            f"What
-            f"What are
+            f"What are the key aspects of {query}?",
+            f"What are current trends in {query}?",
+            f"What are the challenges with {query}?"
         ]

-async def
+async def continuous_search(query: str, search_time: int = 120) -> List[dict]:
+    """
+    Perform continuous searching for better results within time constraints.
+    """
+    start_time = time.time()
+    all_results = []
+    seen_urls = set()
+
+    # Generate multiple variations of the query
+    query_variations = [
+        query,
+        f"{query} comparison",
+        f"{query} analysis",
+        f"{query} review",
+        f"{query} features",
+        f"{query} vs alternatives"
+    ]
+
+    async with aiohttp.ClientSession() as session:
+        while time.time() - start_time < search_time:
+            # Shuffle the query variations to get diverse results
+            random.shuffle(query_variations)
+
+            for q in query_variations[:3]:  # Only use first 3 variations in each iteration
+                if time.time() - start_time >= search_time:
+                    break
+
+                try:
+                    results = await fetch_search_results(q, max_results=5)
+                    for result in results:
+                        clean_link = clean_url(result['link'])
+                        if clean_link and clean_link not in seen_urls:
+                            seen_urls.add(clean_link)
+                            result['link'] = clean_link
+                            all_results.append(result)
+                            logging.info(f"Found new result: {result['title']}")
+
+                    # Small delay between searches
+                    await asyncio.sleep(1.0)
+
+                    # If we have enough unique results, we can stop early
+                    if len(all_results) >= MAX_SOURCES_TO_PROCESS * 1.5:  # Get more than we need for selection
+                        break
+                except Exception as e:
+                    logging.error(f"Error during continuous search: {e}")
+                    await asyncio.sleep(2.0)  # Wait a bit before trying again
+
+            # Also stop the outer loop once we have enough unique results
+            if len(all_results) >= MAX_SOURCES_TO_PROCESS * 1.5:
+                break
+
+    # Filter and sort results by relevance
+    if all_results:
+        # Simple relevance scoring (could be enhanced with more sophisticated methods)
+        def score_result(result):
+            # Score based on how many query terms appear in title/snippet
+            query_terms = set(query.lower().split())
+            title = result['title'].lower()
+            snippet = result['snippet'].lower()
+
+            matches = 0
+            for term in query_terms:
+                if term in title or term in snippet:
+                    matches += 1
+
+            # Also consider length of snippet as a proxy for content richness
+            snippet_length = len(result['snippet'].split())
+
+            return matches * 10 + snippet_length
+
+        # Sort by score, descending
+        all_results.sort(key=lambda x: score_result(x), reverse=True)
+
+    return all_results[:MAX_SOURCES_TO_PROCESS * 2]  # Return more than we need for selection
+
+async def filter_and_select_sources(results: List[dict]) -> List[dict]:
+    """
+    Filter and select the best sources from search results.
+    """
+    if not results:
+        return []
+
+    # Group by domain to ensure diversity
+    domain_counts = defaultdict(int)
+    domain_results = defaultdict(list)
+    for result in results:
+        domain = urlparse(result['link']).netloc
+        domain_counts[domain] += 1
+        domain_results[domain].append(result)
+
+    selected = []
+
+    # First pass: take the top result from each domain
+    for domain, domain_res in domain_results.items():
+        if len(selected) >= MAX_SOURCES_TO_PROCESS:
+            break
+        # Take the best result from this domain (sorted by position in original results)
+        if domain_res:
+            selected.append(domain_res[0])
+
+    # Second pass: if we need more, take additional results from domains with good content
+    if len(selected) < MAX_SOURCES_TO_PROCESS:
+        # Calculate average snippet length as a proxy for content quality
+        domain_quality = {}
+        for domain, domain_res in domain_results.items():
+            avg_length = sum(len(r['snippet'].split()) for r in domain_res) / len(domain_res)
+            domain_quality[domain] = avg_length
+
+        # Sort domains by quality
+        sorted_domains = sorted(domain_quality.items(), key=lambda x: x[1], reverse=True)
+
+        # Add more results from high-quality domains
+        for domain, _ in sorted_domains:
+            if len(selected) >= MAX_SOURCES_TO_PROCESS:
+                break
+            for res in domain_results[domain]:
+                if res not in selected:
+                    selected.append(res)
+                    if len(selected) >= MAX_SOURCES_TO_PROCESS:
+                        break
+
+    # Final pass: if still need more, add remaining high-snippet-length results
+    if len(selected) < MAX_SOURCES_TO_PROCESS:
+        all_results_sorted = sorted(results, key=lambda x: len(x['snippet'].split()), reverse=True)
+        for res in all_results_sorted:
+            if res not in selected:
+                selected.append(res)
+                if len(selected) >= MAX_SOURCES_TO_PROCESS:
+                    break
+
+    return selected[:MAX_SOURCES_TO_PROCESS]
+
+async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
     def format_sse(data: dict) -> str:
         return f"data: {json.dumps(data)}\n\n"

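A minimal sketch of how the two new helpers are meant to compose outside the SSE stream, assuming the file above is saved as main.py (the module name, the query string, and the 30-second budget are assumptions for the example, not part of the commit):

```python
import asyncio
from urllib.parse import urlparse

import main  # assumes the module above is importable as main

async def preview(query: str) -> None:
    # Gather candidate results for up to 30 seconds, then pick a domain-diverse subset
    results = await main.continuous_search(query, search_time=30)
    selected = await main.filter_and_select_sources(results)
    for source in selected:
        print(urlparse(source['link']).netloc, "-", source['title'])

asyncio.run(preview("open-source vector databases"))
```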
@@ -345,7 +535,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
         # Initialize the SSE stream with start message
         yield format_sse({
             "event": "status",
-            "data": f"Starting deep research on '{query}'.
+            "data": f"Starting deep research on '{query}'. Search time limit: {search_time} seconds."
         })

         async with aiohttp.ClientSession() as session:
@@ -354,68 +544,40 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             sub_questions = await generate_research_plan(query, session)
             yield format_sse({"event": "plan", "data": sub_questions})

-            # Step 2:
+            # Step 2: Continuous search for better results
             yield format_sse({
                 "event": "status",
-                "data": f"
+                "data": f"Performing continuous search for up to {search_time} seconds..."
             })

-
-
-
-
-
-                    await asyncio.sleep(REQUEST_DELAY)
-
-                    results = await fetch_search_results(sub_question, max_results=3)
-                    if results:
-                        all_search_results.extend(results)
-                        yield format_sse({
-                            "event": "status",
-                            "data": f"Found {len(results)} sources for question: '{sub_question[:60]}...'"
-                        })
-                    else:
-                        yield format_sse({
-                            "event": "warning",
-                            "data": f"No search results found for: '{sub_question[:60]}...'"
-                        })
-                except Exception as e:
-                    logging.error(f"Search failed for '{sub_question}': {e}")
-                    yield format_sse({
-                        "event": "warning",
-                        "data": f"Search failed for one sub-topic: {str(e)[:100]}"
-                    })
+            search_results = await continuous_search(query, search_time)
+            yield format_sse({
+                "event": "status",
+                "data": f"Found {len(search_results)} potential sources. Selecting the best ones..."
+            })

-            if not
+            if not search_results:
                 yield format_sse({
                     "event": "error",
                     "data": "No search results found. Check your query and try again."
                 })
                 return

-            #
-
-            seen_urls = set()
-            for result in all_search_results:
-                if result['link'] not in seen_urls:
-                    seen_urls.add(result['link'])
-                    unique_sources.append(result)
-
-            # Limit to max sources we want to process
-            unique_sources = unique_sources[:MAX_SOURCES_TO_PROCESS]
+            # Select the best sources
+            selected_sources = await filter_and_select_sources(search_results)
             yield format_sse({
                 "event": "status",
-                "data": f"
+                "data": f"Selected {len(selected_sources)} high-quality sources to process."
             })

-            if not
+            if not selected_sources:
                 yield format_sse({
                     "event": "error",
-                    "data": "No valid sources found after
+                    "data": "No valid sources found after filtering."
                 })
                 return

-            # Step 3: Process sources with concurrency control
+            # Step 3: Process selected sources with concurrency control
             semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
             consolidated_context = ""
             all_sources_used = []
@@ -427,13 +589,13 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:

             # Process sources with progress updates
             processing_tasks = []
-            for i, source in enumerate(
+            for i, source in enumerate(selected_sources):
                 # Check if we're running out of time
                 elapsed = time.time() - start_time
-                if elapsed >
+                if elapsed > TOTAL_TIMEOUT * 0.8:  # Leave 20% of time for synthesis
                     yield format_sse({
                         "event": "status",
-                        "data": f"Approaching time limit, stopping source processing at {i}/{len(
+                        "data": f"Approaching time limit, stopping source processing at {i}/{len(selected_sources)}"
                     })
                     break

@@ -444,10 +606,10 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                 task = asyncio.create_task(process_with_semaphore(source))
                 processing_tasks.append(task)

-                if (i + 1) % 2 == 0 or (i + 1) == len(
+                if (i + 1) % 2 == 0 or (i + 1) == len(selected_sources):
                     yield format_sse({
                         "event": "status",
-                        "data": f"Processed {min(i+1, len(
+                        "data": f"Processed {min(i+1, len(selected_sources))}/{len(selected_sources)} sources..."
                     })

             # Process completed tasks as they finish
@@ -458,7 +620,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                     consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
                     all_sources_used.append(source_info)
                     successful_sources += 1
-                    total_tokens += len(content.split())
+                    total_tokens += len(content.split())  # Rough token count
                 else:
                     processing_errors += 1

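The processing loop above relies on a process_with_semaphore helper whose body sits outside this diff. The commit does not show it, so the following is only a sketch of the usual pattern it presumably follows; the names mirror the surrounding code, and the timeout value is an assumption:

```python
# Assumed shape of the helper scheduled with asyncio.create_task above:
# it bounds concurrent fetches with the shared semaphore and delegates
# the actual scraping to process_web_source.
async def process_with_semaphore(source):
    async with semaphore:  # semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
        # timeout value is not visible in this diff; 15 seconds is assumed here
        return await process_web_source(session, source, 15)
```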
@@ -469,30 +631,38 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                 })
                 return

-            # Step 4: Synthesize report
-            time_remaining = max(0,
+            # Step 4: Synthesize comprehensive report
+            time_remaining = max(0, TOTAL_TIMEOUT - (time.time() - start_time))
             yield format_sse({
                 "event": "status",
-                "data": f"Synthesizing report
+                "data": f"Synthesizing comprehensive report from {successful_sources} sources..."
             })

-            max_output_tokens = min(
+            max_output_tokens = min(2000, int(time_remaining * 6))  # More aggressive token count
+
+            report_prompt = f"""Compose an in-depth analysis report on "{query}".
+
+            Structure the report with these sections:
+            1. Introduction and Background
+            2. Key Features and Capabilities
+            3. Comparative Analysis with Alternatives
+            4. Current Developments and Trends
+            5. Challenges and Limitations
+            6. Future Outlook
+            7. Conclusion and Recommendations

-
-
-
+            For each section, provide detailed analysis based on the source material.
+            Include specific examples and data points from the sources when available.
+            Compare and contrast different viewpoints from various sources.

-
-
-            2. Include well-organized sections with clear headings
-            3. Cite specific information from sources where appropriate
-            4. End with a conclusion that summarizes key findings and insights
-            5. Keep the report concise but comprehensive
+            Use markdown formatting for headings, subheadings, lists, and emphasis.
+            Cite sources where appropriate using inline citations like [1][2].

-            Available information
-            {consolidated_context[:
+            Available information from {successful_sources} sources:
+            {consolidated_context[:20000]}

-            Generate a report
+            Generate a comprehensive report of approximately {max_output_tokens//4} words.
+            Focus on providing deep insights, analysis, and actionable information.
             """

             report_payload = {
@@ -502,10 +672,11 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                 "max_tokens": max_output_tokens
             }

+            # Stream the report generation
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                 response.raise_for_status()
                 async for line in response.content:
-                    if time.time() - start_time >
+                    if time.time() - start_time > TOTAL_TIMEOUT:
                         yield format_sse({
                             "event": "warning",
                             "data": "Time limit reached, ending report generation early."
@@ -528,6 +699,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                         logging.warning(f"Error processing stream chunk: {e}")
                         continue

+            # Final status update
             duration = time.time() - start_time
             stats = {
                 "total_time_seconds": round(duration),
@@ -546,7 +718,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
     except asyncio.TimeoutError:
         yield format_sse({
             "event": "error",
-            "data": f"Research process timed out after {
+            "data": f"Research process timed out after {TOTAL_TIMEOUT} seconds."
         })
     except Exception as e:
         logging.error(f"Critical error in research process: {e}", exc_info=True)
@@ -567,8 +739,9 @@ async def deep_research_endpoint(request: DeepResearchRequest):
     if not request.query or len(request.query.strip()) < 3:
         raise HTTPException(status_code=400, detail="Query must be at least 3 characters long")

+    search_time = min(max(request.search_time, 60), 180)  # Clamp between 60 and 180 seconds
     return StreamingResponse(
-        run_deep_research_stream(request.query.strip()),
+        run_deep_research_stream(request.query.strip(), search_time),
         media_type="text/event-stream"
     )

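For reference, a small client sketch that exercises the updated request model and reads the SSE stream; the route path "/deep-research" and the localhost port are assumptions, since the route decorator is outside this diff, so adjust them to the actual app:

```python
import asyncio
import json

import aiohttp

async def stream_research(query: str, search_time: int = 90) -> None:
    # search_time is clamped to 60-180 seconds on the server side
    payload = {"query": query, "search_time": search_time}
    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:8000/deep-research", json=payload) as resp:
            resp.raise_for_status()
            async for raw in resp.content:
                line = raw.decode().strip()
                if line.startswith("data: "):
                    # Each SSE frame is a JSON object with "event" and "data" keys
                    event = json.loads(line[len("data: "):])
                    print(event.get("event"), "->", str(event.get("data"))[:80])

asyncio.run(stream_research("open-source vector databases"))
```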