Update main.py
main.py
CHANGED
@@ -34,12 +34,14 @@ else:
 # --- Constants & Headers ---
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-MAX_SOURCES_TO_PROCESS = 10
-MAX_CONCURRENT_REQUESTS =
-SEARCH_TIMEOUT =
-TOTAL_TIMEOUT = 180
-REQUEST_DELAY =
+MAX_SOURCES_TO_PROCESS = 10
+MAX_CONCURRENT_REQUESTS = 3  # Reduced to avoid rate-limiting
+SEARCH_TIMEOUT = 90  # Reduced to ensure time for processing
+TOTAL_TIMEOUT = 180
+REQUEST_DELAY = 2.0  # Increased delay to avoid rate-limiting
 USER_AGENT_ROTATION = True
+RETRY_ATTEMPTS = 3  # Number of retries for failed search requests
+RETRY_DELAY = 3.0  # Delay between retries
 
 # Initialize fake user agent generator
 try:
@@ -62,7 +64,7 @@ LLM_HEADERS = {
 
 class DeepResearchRequest(BaseModel):
     query: str
-    search_time: int =
+    search_time: int = 90  # Default to 90 seconds
 
 app = FastAPI(
     title="AI Deep Research API",
@@ -91,8 +93,8 @@ async def get_real_user_agent() -> str:
     """Get a realistic user agent string."""
     try:
         if isinstance(ua, UserAgent):
-            return ua.random
-        return ua.random()
+            return ua.random
+        return ua.random()
     except:
         return "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
 
@@ -103,21 +105,18 @@ def clean_url(url: str) -> str:
 
     # Handle DuckDuckGo redirect URLs
     if url.startswith('//duckduckgo.com/l/'):
-        url = f"https:{url}"
+        url = f"https:{url}"
         try:
-            # Extract the real URL from DuckDuckGo's redirect
            parsed = urlparse(url)
            query_params = parsed.query
            if 'uddg=' in query_params:
-                # Extract the actual URL from the parameter
                match = re.search(r'uddg=([^&]+)', query_params)
                if match:
                    encoded_url = match.group(1)
                    try:
-                        return encoded_url
+                        # Properly decode the URL
+                        from urllib.parse import unquote
+                        return unquote(encoded_url)
                    except:
                        pass
        except:
@@ -148,7 +147,6 @@ async def check_robots_txt(url: str) -> bool:
                 robots = await response.text()
                 if "Disallow: /" in robots:
                     return False
-                # Check for specific path disallows
                 path = re.sub(r'https?://[^/]+', '', url)
                 if any(f"Disallow: {p}" in robots for p in [path, path.rstrip('/') + '/']):
                     return False
@@ -159,66 +157,70 @@ async def check_robots_txt(url: str) -> bool:
 
 async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
     """
-    Perform a real search using DuckDuckGo's HTML interface with
+    Perform a real search using DuckDuckGo's HTML interface with retry logic.
     """
-        async with
-            results = []
-            # Try multiple selectors as DuckDuckGo may change their HTML structure
-            for selector in ['.result__body', '.result__a', '.result']:
-                if len(results) >= max_results:
-                    break
-
-                for result in soup.select(selector)[:max_results]:
-                    try:
-                        title_elem = result.select_one('.result__title .result__a') or result.select_one('.result__a')
-                        if not title_elem:
-                            continue
-
-                        logging.warning(f"Error parsing search result: {e}")
-                        continue
-
-            logging.info(f"Found {len(results)} real search results for '{query}'")
-            return results[:max_results]
-    except Exception as e:
-        logging.error(f"Real search failed: {e}")
-        return []
+    headers = {
+        "User-Agent": await get_real_user_agent(),
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Referer": "https://duckduckgo.com/",
+        "DNT": "1"
+    }
+
+    for attempt in range(RETRY_ATTEMPTS):
+        try:
+            search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
+            async with aiohttp.ClientSession() as session:
+                async with session.get(search_url, headers=headers, timeout=10) as response:
+                    if response.status != 200:
+                        if response.status == 202:
+                            logging.warning(f"Search attempt {attempt + 1} failed with status 202 for query '{query}'")
+                            if attempt < RETRY_ATTEMPTS - 1:
+                                await asyncio.sleep(RETRY_DELAY)
+                                continue
+                        logging.warning(f"Search failed with status {response.status} for query '{query}'")
+                        return []
+
+                    html = await response.text()
+                    soup = BeautifulSoup(html, 'html.parser')
+
+                    results = []
+                    for selector in ['.result__body', '.result__a', '.result']:
+                        if len(results) >= max_results:
+                            break
+
+                        for result in soup.select(selector)[:max_results]:
+                            try:
+                                title_elem = result.select_one('.result__title .result__a') or result.select_one('.result__a')
+                                if not title_elem:
+                                    continue
+
+                                link = title_elem['href']
+                                snippet_elem = result.select_one('.result__snippet')
+
+                                clean_link = clean_url(link)
+                                if not clean_link or clean_link.startswith('javascript:'):
+                                    continue
+
+                                snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""
+                                results.append({
+                                    'title': title_elem.get_text(strip=True),
+                                    'link': clean_link,
+                                    'snippet': snippet
+                                })
+                            except Exception as e:
+                                logging.warning(f"Error parsing search result: {e}")
+                                continue
+
+                    logging.info(f"Found {len(results)} real search results for '{query}'")
+                    return results[:max_results]
+        except Exception as e:
+            logging.error(f"Search attempt {attempt + 1} failed for '{query}': {e}")
+            if attempt < RETRY_ATTEMPTS - 1:
+                await asyncio.sleep(RETRY_DELAY)
+                continue
+            logging.error(f"All {RETRY_ATTEMPTS} search attempts failed for '{query}'")
+            return []
 
 async def process_web_source(session: aiohttp.ClientSession, source: dict, timeout: int = 15) -> Tuple[str, dict]:
     """
@@ -226,13 +228,12 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeout: int = 15) -> Tuple[str, dict]:
     """
     headers = {'User-Agent': await get_real_user_agent()}
     source_info = source.copy()
-    source_info['link'] = clean_url(source['link'])
+    source_info['link'] = clean_url(source['link'])
 
-    # Skip if URL is invalid
     if not source_info['link'] or not source_info['link'].startswith(('http://', 'https://')):
+        logging.warning(f"Invalid URL: {source_info['link']}")
         return source.get('snippet', ''), source_info
 
-    # Check robots.txt first
     if not await check_robots_txt(source_info['link']):
         logging.info(f"Scraping disallowed by robots.txt for {source_info['link']}")
         return source.get('snippet', ''), source_info
@@ -241,12 +242,10 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeout: int = 15) -> Tuple[str, dict]:
         logging.info(f"Processing source: {source_info['link']}")
         start_time = time.time()
 
-        # Skip non-HTML content
         if any(source_info['link'].lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx']):
             logging.info(f"Skipping non-HTML content at {source_info['link']}")
             return source.get('snippet', ''), source_info
 
-        # Add delay between requests to be polite
         await asyncio.sleep(REQUEST_DELAY)
 
         async with session.get(source_info['link'], headers=headers, timeout=timeout, ssl=False) as response:
@@ -262,11 +261,9 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeout: int = 15) -> Tuple[str, dict]:
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
 
-            # Remove unwanted elements
             for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe', 'noscript', 'form']):
                 tag.decompose()
 
-            # Try to find main content by common patterns
             selectors_to_try = [
                 'main',
                 'article',
@@ -289,7 +286,6 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeout: int = 15) -> Tuple[str, dict]:
                     break
 
             if not main_content:
-                # If no main content found, try to find the largest text block
                 all_elements = soup.find_all()
                 candidates = [el for el in all_elements if el.name not in ['script', 'style', 'nav', 'footer', 'header']]
                 if candidates:
@@ -299,31 +295,25 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeout: int = 15) -> Tuple[str, dict]:
             if not main_content:
                 main_content = soup.find('body') or soup
 
-            # Clean up the content
             content = " ".join(main_content.stripped_strings)
             content = re.sub(r'\s+', ' ', content).strip()
 
-            # If content is too short, try alternative extraction methods
             if len(content.split()) < 50 and len(html) > 10000:
-                # Try extracting all paragraphs
                 paras = soup.find_all('p')
                 content = " ".join([p.get_text() for p in paras if p.get_text().strip()])
                 content = re.sub(r'\s+', ' ', content).strip()
 
-                # If still too short, try getting all text nodes
                 if len(content.split()) < 50:
                     content = " ".join(soup.stripped_strings)
                     content = re.sub(r'\s+', ' ', content).strip()
 
-            # If content is still too short, try to extract from specific tags
             if len(content.split()) < 30:
-                # Try to get content from divs with certain classes
                 for tag in ['div', 'section', 'article']:
                     for element in soup.find_all(tag):
-                        if len(element.get_text().split()) > 200:
+                        if len(element.get_text().split()) > 200:
                             content = " ".join(element.stripped_strings)
                             content = re.sub(r'\s+', ' ', content).strip()
-                            if len(content.split()) >= 30:
+                            if len(content.split()) >= 30:
                                 break
                     if len(content.split()) >= 30:
                         break
@@ -375,9 +365,8 @@ async def generate_research_plan(query: str, session: aiohttp.ClientSession) ->
             cleaned_q = re.sub(r'^[^a-zA-Z0-9]*|[^a-zA-Z0-9]*$', '', q)
             if cleaned_q:
                 cleaned.append(cleaned_q)
-        return cleaned[:6]
+        return cleaned[:6]
 
-    # Fallback if we couldn't get good questions from LLM
     return [
         f"What is {query} and its key features?",
        f"How does {query} compare to alternatives?",
@@ -394,35 +383,34 @@
         f"What are the challenges with {query}?"
     ]
 
-async def continuous_search(query: str, search_time: int = 120) -> List[dict]:
+async def continuous_search(query: str, search_time: int = 90) -> List[dict]:
     """
-    Perform continuous searching
+    Perform continuous searching with retries and diverse queries.
     """
     start_time = time.time()
     all_results = []
     seen_urls = set()
 
-    # Generate multiple variations of the query
     query_variations = [
         query,
         f"{query} comparison",
-        f"{query} analysis",
         f"{query} review",
-        f"{query}
-        f"{query}
+        f"{query} latest developments",
+        f"{query} features and benefits",
+        f"{query} challenges and limitations"
     ]
 
     async with aiohttp.ClientSession() as session:
+        iteration = 0
         while time.time() - start_time < search_time:
+            iteration += 1
             random.shuffle(query_variations)
-            for q in query_variations[:3]:  # Only use first 3 variations in each iteration
+            for q in query_variations[:3]:
                 if time.time() - start_time >= search_time:
                     logger.info(f"Search timed out after {search_time} seconds. Found {len(all_results)} results.")
                     break
 
-                logger.info(f"Searching for query variation: {q}")
+                logger.info(f"Iteration {iteration}: Searching for query variation: {q}")
                 try:
                     results = await fetch_search_results(q, max_results=5)
                     logger.info(f"Retrieved {len(results)} results for query '{q}'")
@@ -434,42 +422,31 @@ async def continuous_search(query: str, search_time: int = 120) -> List[dict]:
                             all_results.append(result)
                             logger.info(f"Added new result: {result['title']} ({result['link']})")
 
-                    # If we have enough unique results, we can stop early
-                    if len(all_results) >= MAX_SOURCES_TO_PROCESS * 1.5:  # Get more than we need for selection
+                    await asyncio.sleep(REQUEST_DELAY)
+                    if len(all_results) >= MAX_SOURCES_TO_PROCESS * 1.5:
                         logger.info(f"Reached sufficient results: {len(all_results)}")
                         break
                 except Exception as e:
                     logger.error(f"Error during search for '{q}': {e}")
-                    await asyncio.sleep(
+                    await asyncio.sleep(RETRY_DELAY)
+
+            if len(all_results) >= MAX_SOURCES_TO_PROCESS * 1.5:
+                break
 
-    # Filter and sort results by relevance
+        logger.info(f"Completed continuous search. Total results: {len(all_results)}")
+
     if all_results:
-        # Simple relevance scoring (could be enhanced with more sophisticated methods)
         def score_result(result):
-            # Score based on how many query terms appear in title/snippet
             query_terms = set(query.lower().split())
             title = result['title'].lower()
             snippet = result['snippet'].lower()
-            matches = 0
-            for term in query_terms:
-                if term in title or term in snippet:
-                    matches += 1
-
-            # Also consider length of snippet as a proxy for content richness
+            matches = sum(1 for term in query_terms if term in title or term in snippet)
             snippet_length = len(result['snippet'].split())
             return matches * 10 + snippet_length
 
-        all_results.sort(key=lambda x: score_result(x), reverse=True)
+        all_results.sort(key=score_result, reverse=True)
 
-    return all_results[:MAX_SOURCES_TO_PROCESS * 2]
+        return all_results[:MAX_SOURCES_TO_PROCESS * 2]
 
 async def filter_and_select_sources(results: List[dict]) -> List[dict]:
     """
@@ -481,7 +458,6 @@ async def filter_and_select_sources(results: List[dict]) -> List[dict]:
 
     logger.info(f"Filtering {len(results)} search results...")
 
-    # Group by domain to ensure diversity
     domain_counts = defaultdict(int)
     domain_results = defaultdict(list)
     for result in results:
@@ -490,28 +466,20 @@ async def filter_and_select_sources(results: List[dict]) -> List[dict]:
         domain_results[domain].append(result)
 
     selected = []
-
-    # First pass: take the top result from each domain
     for domain, domain_res in domain_results.items():
         if len(selected) >= MAX_SOURCES_TO_PROCESS:
             break
-        # Take the best result from this domain (sorted by position in original results)
         if domain_res:
             selected.append(domain_res[0])
             logger.info(f"Selected top result from domain {domain}: {domain_res[0]['link']}")
 
-    # Second pass: if we need more, take additional results from domains with good content
     if len(selected) < MAX_SOURCES_TO_PROCESS:
-        # Calculate average snippet length as a proxy for content quality
         domain_quality = {}
         for domain, domain_res in domain_results.items():
             avg_length = sum(len(r['snippet'].split()) for r in domain_res) / len(domain_res)
             domain_quality[domain] = avg_length
 
-        # Sort domains by quality
         sorted_domains = sorted(domain_quality.items(), key=lambda x: x[1], reverse=True)
-
-        # Add more results from high-quality domains
         for domain, _ in sorted_domains:
             if len(selected) >= MAX_SOURCES_TO_PROCESS:
                 break
@@ -522,7 +490,6 @@ async def filter_and_select_sources(results: List[dict]) -> List[dict]:
             if len(selected) >= MAX_SOURCES_TO_PROCESS:
                 break
 
-    # Final pass: if still need more, add remaining high-snippet-length results
     if len(selected) < MAX_SOURCES_TO_PROCESS:
         all_results_sorted = sorted(results, key=lambda x: len(x['snippet'].split()), reverse=True)
         for res in all_results_sorted:
@@ -535,7 +502,7 @@ async def filter_and_select_sources(results: List[dict]) -> List[dict]:
     logger.info(f"Selected {len(selected)} sources after filtering.")
     return selected[:MAX_SOURCES_TO_PROCESS]
 
-async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
+async def run_deep_research_stream(query: str, search_time: int = 90) -> AsyncGenerator[str, None]:
     def format_sse(data: dict) -> str:
         return f"data: {json.dumps(data)}\n\n"
 
@@ -545,19 +512,16 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
     total_tokens = 0
 
     try:
-        # Initialize the SSE stream with start message
         yield format_sse({
             "event": "status",
             "data": f"Starting deep research on '{query}'. Search time limit: {search_time} seconds."
         })
 
         async with aiohttp.ClientSession() as session:
-            # Step 1: Generate research plan
             yield format_sse({"event": "status", "data": "Generating comprehensive research plan..."})
             sub_questions = await generate_research_plan(query, session)
             yield format_sse({"event": "plan", "data": sub_questions})
 
-            # Step 2: Continuous search for better results
             yield format_sse({
                 "event": "status",
                 "data": f"Performing continuous search for up to {search_time} seconds..."
@@ -580,7 +544,6 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
                 })
                 return
 
-            # Select the best sources
             selected_sources = await filter_and_select_sources(search_results)
             yield format_sse({
                 "event": "status",
@@ -598,7 +561,6 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
                 })
                 return
 
-            # Step 3: Process selected sources with concurrency control
             semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
             consolidated_context = ""
             all_sources_used = []
@@ -608,19 +570,16 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
                 async with semaphore:
                     return await process_web_source(session, source, timeout=20)
 
-            # Process sources with progress updates
             processing_tasks = []
             for i, source in enumerate(selected_sources):
-                # Check if we're running out of time
                 elapsed = time.time() - start_time
-                if elapsed > TOTAL_TIMEOUT * 0.8:
+                if elapsed > TOTAL_TIMEOUT * 0.8:
                     yield format_sse({
                         "event": "status",
                         "data": f"Approaching time limit, stopping source processing at {i}/{len(selected_sources)}"
                     })
                     break
 
-                # Add delay between processing each source to be polite
                 if i > 0:
                     await asyncio.sleep(REQUEST_DELAY * 0.5)
 
@@ -633,7 +592,6 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
                     "data": f"Processed {min(i+1, len(selected_sources))}/{len(selected_sources)} sources..."
                 })
 
-            # Process completed tasks as they finish
             for future in asyncio.as_completed(processing_tasks):
                 processed_sources += 1
                 content, source_info = await future
@@ -641,7 +599,7 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
                     consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
                     all_sources_used.append(source_info)
                     successful_sources += 1
-                    total_tokens += len(content.split())
+                    total_tokens += len(content.split())
                     yield format_sse({
                         "event": "processed_source",
                         "data": source_info
@@ -656,14 +614,13 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
                 })
                 return
 
-            # Step 4: Synthesize comprehensive report
             time_remaining = max(0, TOTAL_TIMEOUT - (time.time() - start_time))
             yield format_sse({
                 "event": "status",
                 "data": f"Synthesizing comprehensive report from {successful_sources} sources..."
             })
 
-            max_output_tokens = min(2000, int(time_remaining * 6))
+            max_output_tokens = min(2000, int(time_remaining * 6))
 
             report_prompt = f"""Compose an in-depth analysis report on "{query}".
 
@@ -684,7 +641,7 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
 Cite sources where appropriate using inline citations like [1][2].
 
 Available information from {successful_sources} sources:
-{consolidated_context[:20000]}
+{consolidated_context[:20000]}
 
 Generate a comprehensive report of approximately {max_output_tokens//4} words.
 Focus on providing deep insights, analysis, and actionable information.
@@ -697,7 +654,6 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
                 "max_tokens": max_output_tokens
             }
 
-            # Stream the report generation
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                 response.raise_for_status()
                 async for line in response.content:
@@ -724,7 +680,6 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
                         logging.warning(f"Error processing stream chunk: {e}")
                         continue
 
-            # Final status update
             duration = time.time() - start_time
             stats = {
                 "total_time_seconds": round(duration),
@@ -764,7 +719,7 @@ async def deep_research_endpoint(request: DeepResearchRequest):
     if not request.query or len(request.query.strip()) < 3:
         raise HTTPException(status_code=400, detail="Query must be at least 3 characters long")
 
-    search_time = min(max(request.search_time, 60), 180)
+    search_time = min(max(request.search_time, 60), 180)
     return StreamingResponse(
         run_deep_research_stream(request.query.strip(), search_time),
         media_type="text/event-stream"
@@ -772,4 +727,4 @@ async def deep_research_endpoint(request: DeepResearchRequest):
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run(app, host="0.0.0.0", port=8000)
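
Note on the clean_url change above: the old code returned the raw uddg= parameter, while the new code percent-decodes it with urllib.parse.unquote. A minimal, self-contained sketch of just that decoding step, using a made-up redirect URL for illustration:

import re
from urllib.parse import unquote, urlparse

def extract_target(url: str) -> str:
    # Mirrors the updated clean_url handling of DuckDuckGo redirect links only.
    if url.startswith('//duckduckgo.com/l/'):
        url = f"https:{url}"
    match = re.search(r'uddg=([^&]+)', urlparse(url).query)
    if match:
        # Percent-decode the wrapped destination URL (the new behaviour).
        return unquote(match.group(1))
    return url

print(extract_target('//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&rut=abc'))
# Prints: https://example.com/page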
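Usage sketch for the streaming endpoint: the snippet below assumes the route is registered as POST /deep-research on port 8000 (the port comes from the uvicorn.run line; the route path is not visible in this diff, so treat it as an assumption and adjust it to whatever the decorator actually registers). It posts a DeepResearchRequest-shaped body and prints each SSE event as it arrives.

import asyncio
import json

import aiohttp

async def main() -> None:
    # Body mirrors DeepResearchRequest: a query plus an optional search_time in seconds.
    payload = {"query": "vector databases", "search_time": 90}
    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:8000/deep-research", json=payload) as resp:
            resp.raise_for_status()
            # The server emits Server-Sent Events: lines of the form "data: {...json...}".
            async for raw in resp.content:
                line = raw.decode("utf-8", "ignore").strip()
                if not line.startswith("data: "):
                    continue
                event = json.loads(line[len("data: "):])
                print(event.get("event"), "->", str(event.get("data"))[:100])

if __name__ == "__main__":
    asyncio.run(main())

Events arrive in the order the generator yields them (status, plan and processed_source events, followed by the report output and final stats), so a client can render progress incrementally.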