rkihacker committed
Commit 4118768 · verified · 1 parent: 4d61bdb

Update main.py

Files changed (1):
  1. main.py +201 -158
main.py CHANGED
@@ -34,11 +34,11 @@ else:
  # --- Constants & Headers ---
  LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
  LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
- MAX_SOURCES_TO_PROCESS = 10  # Increased to get more comprehensive results
- MAX_CONCURRENT_REQUESTS = 5  # Increased for faster processing
- SEARCH_TIMEOUT = 120  # 2 minutes for searching (adjustable)
- TOTAL_TIMEOUT = 180  # 3 minutes total
- REQUEST_DELAY = 1.0  # Shorter delay between requests
  USER_AGENT_ROTATION = True

  # Initialize fake user agent generator
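The initialization itself is outside this hunk; a minimal sketch of what user-agent rotation with the fake_useragent package typically looks like (the helper name is hypothetical, not from this commit):

import random
from fake_useragent import UserAgent

ua = UserAgent()

def pick_user_agent() -> str:
    # Illustrative only: return a rotating User-Agent when USER_AGENT_ROTATION is enabled.
    return ua.random if USER_AGENT_ROTATION else ua.chrome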
@@ -62,12 +62,12 @@ LLM_HEADERS = {
 
  class DeepResearchRequest(BaseModel):
      query: str
-     search_time: int = 120  # Default to 2 minutes

  app = FastAPI(
      title="AI Deep Research API",
-     description="Provides comprehensive research reports from real web searches within 1-2 minutes.",
-     version="3.0.0"
  )
  app.add_middleware(
      CORSMiddleware,
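For reference, a request body matching the DeepResearchRequest model would look roughly like this (values are illustrative only):

# Example payload accepted by the /deep-research endpoint.
payload = {
    "query": "quantum computing startups",
    "search_time": 120,  # seconds; optional, the model supplies a default
}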
@@ -79,7 +79,9 @@ app.add_middleware(
 
  def extract_json_from_llm_response(text: str) -> Optional[list]:
      """Extract JSON array from LLM response text."""
-     match = re.search(r'\[.*\]', text, re.DOTALL)
      if match:
          try:
              return json.loads(match.group(0))
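A quick illustration of how this helper behaves on a raw completion (the sample string is invented; the None fallback is implied by the Optional[list] signature rather than shown in this hunk):

# The regex grabs the bracketed block and json.loads parses it.
raw = 'Here is the plan:\n["What is X?", "How does X compare to Y?"]'
print(extract_json_from_llm_response(raw))            # ["What is X?", "How does X compare to Y?"]
print(extract_json_from_llm_response("no json here"))  # presumably None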
@@ -105,19 +107,19 @@ def clean_url(url: str) -> str:
      if url.startswith('//duckduckgo.com/l/'):
          url = f"https:{url}"  # Make it a proper URL
          try:
-             # Extract the real URL from DuckDuckGo's redirect
              parsed = urlparse(url)
              query_params = parsed.query
              if 'uddg=' in query_params:
-                 # Extract the actual URL from the parameter
                  match = re.search(r'uddg=([^&]+)', query_params)
                  if match:
                      encoded_url = match.group(1)
                      try:
-                         url = quote_plus(encoded_url)  # This might need better decoding
-                         # For simplicity, we'll just return the decoded URL
-                         # In production, you'd want to properly URL-decode this
-                         return encoded_url
                      except:
                          pass
          except:
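The removed branch notes that the uddg parameter still needs proper URL-decoding; a minimal sketch of how that decoding could be done with urllib.parse.unquote_plus (not part of this commit):

from urllib.parse import unquote_plus

def decode_uddg(encoded_url: str) -> str:
    # Sketch only: unquote_plus reverses percent-encoding (quote_plus would encode instead).
    decoded = unquote_plus(encoded_url)
    if '%25' in decoded:          # handle an occasionally double-encoded value
        decoded = unquote_plus(decoded)
    return decoded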
@@ -193,7 +195,7 @@ async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
                      continue

                  link = title_elem['href']
-                 snippet_elem = result.select_one('.result__snippet')

                  # Clean the URL
                  clean_link = clean_url(link)
@@ -205,10 +207,15 @@ async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
                  # Get snippet if available
                  snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""

                  results.append({
                      'title': title_elem.get_text(strip=True),
                      'link': clean_link,
-                     'snippet': snippet
                  })
              except Exception as e:
                  logging.warning(f"Error parsing search result: {e}")
@@ -372,7 +379,7 @@ async def generate_research_plan(query: str, session: aiohttp.ClientSession) ->
      cleaned = []
      for q in sub_questions:
          if isinstance(q, str) and q.strip():
-             cleaned_q = re.sub(r'^[^a-zA-Z0-9]*|[^a-zA-Z0-9]*$', '', q)
              if cleaned_q:
                  cleaned.append(cleaned_q)
      return cleaned[:6]  # Limit to 6 questions max
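The regex trims any run of non-alphanumeric characters from both ends of each sub-question; a small illustration (sample string invented):

import re

q = '- What are the key features?'
cleaned_q = re.sub(r'^[^a-zA-Z0-9]*|[^a-zA-Z0-9]*$', '', q)
# -> 'What are the key features'  (leading "- " and trailing "?" are stripped)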
@@ -397,10 +404,13 @@ async def generate_research_plan(query: str, session: aiohttp.ClientSession) ->
 
  async def continuous_search(query: str, search_time: int = 120) -> List[dict]:
      """
      Perform continuous searching for better results within time constraints.
      """
      start_time = time.time()
      all_results = []
      seen_urls = set()

      # Generate multiple variations of the query
      query_variations = [
@@ -409,43 +419,70 @@ async def continuous_search(query: str, search_time: int = 120) -> List[dict]:
          f"{query} analysis",
          f"{query} review",
          f"{query} features",
-         f"{query} vs alternatives"
      ]

      async with aiohttp.ClientSession() as session:
          while time.time() - start_time < search_time:
              # Shuffle the query variations to get diverse results
              random.shuffle(query_variations)

-             for q in query_variations[:3]:  # Only use first 3 variations in each iteration
                  if time.time() - start_time >= search_time:
                      break

                  try:
                      results = await fetch_search_results(q, max_results=5)
-                     for result in results:
-                         clean_link = clean_url(result['link'])
-                         if clean_link and clean_link not in seen_urls:
                              seen_urls.add(clean_link)
                              result['link'] = clean_link
                              all_results.append(result)
-                             logging.info(f"Found new result: {result['title']}")

                      # Small delay between searches
                      await asyncio.sleep(1.0)

                      # If we have enough unique results, we can stop early
-                     if len(all_results) >= MAX_SOURCES_TO_PROCESS * 1.5:  # Get more than we need for selection
                          break
                  except Exception as e:
                      logging.error(f"Error during continuous search: {e}")
                      await asyncio.sleep(2.0)  # Wait a bit before trying again

      # Filter and sort results by relevance
      if all_results:
-         # Simple relevance scoring (could be enhanced with more sophisticated methods)
          def score_result(result):
-             # Score based on how many query terms appear in title/snippet
              query_terms = set(query.lower().split())
              title = result['title'].lower()
              snippet = result['snippet'].lower()
@@ -458,7 +495,11 @@ async def continuous_search(query: str, search_time: int = 120) -> List[dict]:
              # Also consider length of snippet as a proxy for content richness
              snippet_length = len(result['snippet'].split())

-             return matches * 10 + snippet_length

          # Sort by score, descending
          all_results.sort(key=lambda x: score_result(x), reverse=True)
@@ -468,19 +509,21 @@ async def continuous_search(query: str, search_time: int = 120) -> List[dict]:
  async def filter_and_select_sources(results: List[dict]) -> List[dict]:
      """
      Filter and select the best sources from search results.
      """
      if not results:
-         return []

      # Group by domain to ensure diversity
      domain_counts = defaultdict(int)
      domain_results = defaultdict(list)
      for result in results:
-         domain = urlparse(result['link']).netloc
          domain_counts[domain] += 1
          domain_results[domain].append(result)

      selected = []

      # First pass: take the top result from each domain
      for domain, domain_res in domain_results.items():
@@ -488,6 +531,8 @@ async def filter_and_select_sources(results: List[dict]) -> List[dict]:
              break
          # Take the best result from this domain (sorted by position in original results)
          if domain_res:
              selected.append(domain_res[0])

      # Second pass: if we need more, take additional results from domains with good content
@@ -495,6 +540,8 @@ async def filter_and_select_sources(results: List[dict]) -> List[dict]:
      # Calculate average snippet length as a proxy for content quality
      domain_quality = {}
      for domain, domain_res in domain_results.items():
          avg_length = sum(len(r['snippet'].split()) for r in domain_res) / len(domain_res)
          domain_quality[domain] = avg_length

@@ -511,16 +558,19 @@ async def filter_and_select_sources(results: List[dict]) -> List[dict]:
          if len(selected) >= MAX_SOURCES_TO_PROCESS:
              break

-     # Final pass: if still need more, add remaining high-snippet-length results
      if len(selected) < MAX_SOURCES_TO_PROCESS:
-         all_results_sorted = sorted(results, key=lambda x: len(x['snippet'].split()), reverse=True)
-         for res in all_results_sorted:
-             if res not in selected:
-                 selected.append(res)
-                 if len(selected) >= MAX_SOURCES_TO_PROCESS:
-                     break

-     return selected[:MAX_SOURCES_TO_PROCESS]

  async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
      def format_sse(data: dict) -> str:
@@ -535,27 +585,66 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncG
          # Initialize the SSE stream with start message
          yield format_sse({
              "event": "status",
-             "data": f"Starting deep research on '{query}'. Search time limit: {search_time} seconds."
          })

          async with aiohttp.ClientSession() as session:
              # Step 1: Generate research plan
              yield format_sse({"event": "status", "data": "Generating comprehensive research plan..."})
              sub_questions = await generate_research_plan(query, session)
-             yield format_sse({"event": "plan", "data": sub_questions})

              # Step 2: Continuous search for better results
              yield format_sse({
                  "event": "status",
-                 "data": f"Performing continuous search for up to {search_time} seconds..."
              })

              search_results = await continuous_search(query, search_time)
              yield format_sse({
                  "event": "status",
-                 "data": f"Found {len(search_results)} potential sources. Selecting the best ones..."
              })

              if not search_results:
                  yield format_sse({
                      "event": "error",
@@ -564,10 +653,13 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncG
                  return

              # Select the best sources
-             selected_sources = await filter_and_select_sources(search_results)
              yield format_sse({
                  "event": "status",
-                 "data": f"Selected {len(selected_sources)} high-quality sources to process."
              })

              if not selected_sources:
@@ -577,6 +669,20 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncG
                  })
                  return

              # Step 3: Process selected sources with concurrency control
              semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
              consolidated_context = ""
@@ -595,7 +701,7 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncG
                  if elapsed > TOTAL_TIMEOUT * 0.8:  # Leave 20% of time for synthesis
                      yield format_sse({
                          "event": "status",
-                         "data": f"Approaching time limit, stopping source processing at {i}/{len(selected_sources)}"
                      })
                      break

@@ -603,26 +709,57 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncG
                  if i > 0:
                      await asyncio.sleep(REQUEST_DELAY * 0.5)

                  task = asyncio.create_task(process_with_semaphore(source))
                  processing_tasks.append(task)

-                 if (i + 1) % 2 == 0 or (i + 1) == len(selected_sources):
-                     yield format_sse({
-                         "event": "status",
-                         "data": f"Processed {min(i+1, len(selected_sources))}/{len(selected_sources)} sources..."
-                     })
-
              # Process completed tasks as they finish
              for future in asyncio.as_completed(processing_tasks):
                  processed_sources += 1
                  content, source_info = await future
                  if content and content.strip():
                      consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
                      all_sources_used.append(source_info)
                      successful_sources += 1
-                     total_tokens += len(content.split())  # Rough token count
                  else:
                      processing_errors += 1

              if not consolidated_context.strip():
                  yield format_sse({
@@ -631,120 +768,26 @@ async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncG
                  })
                  return

              # Step 4: Synthesize comprehensive report
              time_remaining = max(0, TOTAL_TIMEOUT - (time.time() - start_time))
              yield format_sse({
                  "event": "status",
-                 "data": f"Synthesizing comprehensive report from {successful_sources} sources..."
              })

              max_output_tokens = min(2000, int(time_remaining * 6))  # More aggressive token count

-             report_prompt = f"""Compose an in-depth analysis report on "{query}".

  Structure the report with these sections:
- 1. Introduction and Background
  2. Key Features and Capabilities
- 3. Comparative Analysis with Alternatives
- 4. Current Developments and Trends
- 5. Challenges and Limitations
- 6. Future Outlook
- 7. Conclusion and Recommendations
-
- For each section, provide detailed analysis based on the source material.
- Include specific examples and data points from the sources when available.
- Compare and contrast different viewpoints from various sources.
-
- Use markdown formatting for headings, subheadings, lists, and emphasis.
- Cite sources where appropriate using inline citations like [1][2].
-
- Available information from {successful_sources} sources:
- {consolidated_context[:20000]}  # Increased context size
-
- Generate a comprehensive report of approximately {max_output_tokens//4} words.
- Focus on providing deep insights, analysis, and actionable information.
- """
-
-             report_payload = {
-                 "model": LLM_MODEL,
-                 "messages": [{"role": "user", "content": report_prompt}],
-                 "stream": True,
-                 "max_tokens": max_output_tokens
-             }
-
-             # Stream the report generation
-             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
-                 response.raise_for_status()
-                 async for line in response.content:
-                     if time.time() - start_time > TOTAL_TIMEOUT:
-                         yield format_sse({
-                             "event": "warning",
-                             "data": "Time limit reached, ending report generation early."
-                         })
-                         break
-
-                     line_str = line.decode('utf-8').strip()
-                     if line_str.startswith('data:'):
-                         line_str = line_str[5:].strip()
-                         if line_str == "[DONE]":
-                             break
-                         try:
-                             chunk = json.loads(line_str)
-                             choices = chunk.get("choices")
-                             if choices and isinstance(choices, list) and len(choices) > 0:
-                                 content = choices[0].get("delta", {}).get("content")
-                                 if content:
-                                     yield format_sse({"event": "chunk", "data": content})
-                         except Exception as e:
-                             logging.warning(f"Error processing stream chunk: {e}")
-                             continue
-
-             # Final status update
-             duration = time.time() - start_time
-             stats = {
-                 "total_time_seconds": round(duration),
-                 "sources_processed": processed_sources,
-                 "sources_successful": successful_sources,
-                 "estimated_tokens": total_tokens,
-                 "sources_used": len(all_sources_used)
-             }
-             yield format_sse({
-                 "event": "status",
-                 "data": f"Research completed successfully in {duration:.1f} seconds."
-             })
-             yield format_sse({"event": "stats", "data": stats})
-             yield format_sse({"event": "sources", "data": all_sources_used})
-
-     except asyncio.TimeoutError:
-         yield format_sse({
-             "event": "error",
-             "data": f"Research process timed out after {TOTAL_TIMEOUT} seconds."
-         })
-     except Exception as e:
-         logging.error(f"Critical error in research process: {e}", exc_info=True)
-         yield format_sse({
-             "event": "error",
-             "data": f"An unexpected error occurred: {str(e)[:200]}"
-         })
-     finally:
-         duration = time.time() - start_time
-         yield format_sse({
-             "event": "complete",
-             "data": f"Research process finished after {duration:.1f} seconds."
-         })
-
- @app.post("/deep-research", response_class=StreamingResponse)
- async def deep_research_endpoint(request: DeepResearchRequest):
-     """Endpoint for deep research that streams SSE responses."""
-     if not request.query or len(request.query.strip()) < 3:
-         raise HTTPException(status_code=400, detail="Query must be at least 3 characters long")
-
-     search_time = min(max(request.search_time, 60), 180)  # Clamp between 60 and 180 seconds
-     return StreamingResponse(
-         run_deep_research_stream(request.query.strip(), search_time),
-         media_type="text/event-stream"
-     )
-
- if __name__ == "__main__":
-     import uvicorn
-     uvicorn.run(app, host="0.0.0.0", port=8000)
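The removed endpoint streamed text/event-stream responses; a minimal client sketch for consuming it (the URL, payload, and the assumption that format_sse emits JSON after "data:" are not part of this commit):

import json
import requests

# Sketch only: stream the /deep-research SSE response line by line.
resp = requests.post(
    "http://localhost:8000/deep-research",
    json={"query": "quantum computing startups", "search_time": 120},
    stream=True,
)
for line in resp.iter_lines(decode_unicode=True):
    if line and line.startswith("data:"):
        event = json.loads(line[5:].strip())  # assumes each frame carries a JSON object
        print(event.get("event"), "->", event.get("data"))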
 
  # --- Constants & Headers ---
  LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
  LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+ MAX_SOURCES_TO_PROCESS = 10
+ MAX_CONCURRENT_REQUESTS = 5
+ SEARCH_TIMEOUT = 120  # Default search time in seconds
+ TOTAL_TIMEOUT = 180  # Total time limit in seconds
+ REQUEST_DELAY = 1.0  # Delay between requests in seconds
  USER_AGENT_ROTATION = True

  # Initialize fake user agent generator

  class DeepResearchRequest(BaseModel):
      query: str
+     search_time: int = SEARCH_TIMEOUT  # Default search time

  app = FastAPI(
      title="AI Deep Research API",
+     description="Provides comprehensive research reports from real web searches.",
+     version="3.1.0"
  )
  app.add_middleware(
      CORSMiddleware,

  def extract_json_from_llm_response(text: str) -> Optional[list]:
      """Extract JSON array from LLM response text."""
+     match = re.search(r'\[
+     .*
+     \]', text, re.DOTALL)
      if match:
          try:
              return json.loads(match.group(0))

      if url.startswith('//duckduckgo.com/l/'):
          url = f"https:{url}"  # Make it a proper URL
          try:
              parsed = urlparse(url)
              query_params = parsed.query
              if 'uddg=' in query_params:
                  match = re.search(r'uddg=([^&]+)', query_params)
                  if match:
                      encoded_url = match.group(1)
                      try:
+                         # URL decode the parameter
+                         decoded_url = quote_plus(encoded_url)
+                         # Sometimes it's double-encoded
+                         if '%25' in decoded_url:
+                             decoded_url = quote_plus(decoded_url)
+                         return decoded_url
                      except:
                          pass
          except:

                      continue

                  link = title_elem['href']
+                 snippet_elem = result.select_one('.result__snippet') or result.select_one('.result__body')

                  # Clean the URL
                  clean_link = clean_url(link)

                  # Get snippet if available
                  snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""

+                 # Skip if we already have this URL
+                 if any(r['link'] == clean_link for r in results):
+                     continue
+
                  results.append({
                      'title': title_elem.get_text(strip=True),
                      'link': clean_link,
+                     'snippet': snippet,
+                     'source': 'duckduckgo'
                  })
              except Exception as e:
                  logging.warning(f"Error parsing search result: {e}")

      cleaned = []
      for q in sub_questions:
          if isinstance(q, str) and q.strip():
+             cleaned_q = re.sub(r'^[^a-zA-Z0-9]*|[^a-zA-Z0-9]*$', '', q)
              if cleaned_q:
                  cleaned.append(cleaned_q)
      return cleaned[:6]  # Limit to 6 questions max

  async def continuous_search(query: str, search_time: int = 120) -> List[dict]:
      """
      Perform continuous searching for better results within time constraints.
+     Provides detailed feedback about the search process.
      """
      start_time = time.time()
      all_results = []
      seen_urls = set()
+     seen_domains = defaultdict(int)
+     search_iterations = 0

      # Generate multiple variations of the query
      query_variations = [

          f"{query} analysis",
          f"{query} review",
          f"{query} features",
+         f"{query} vs alternatives",
+         f"latest {query} news",
+         f"{query} pros and cons"
      ]

      async with aiohttp.ClientSession() as session:
          while time.time() - start_time < search_time:
+             search_iterations += 1
              # Shuffle the query variations to get diverse results
              random.shuffle(query_variations)

+             # Use only a subset of queries each iteration
+             queries_for_this_iteration = query_variations[:min(3, len(query_variations))]
+
+             for q in queries_for_this_iteration:
                  if time.time() - start_time >= search_time:
                      break

                  try:
+                     # Notify about current search
+                     logging.info(f"Searching for: '{q}'")
                      results = await fetch_search_results(q, max_results=5)
+
+                     if results:
+                         for result in results:
+                             clean_link = clean_url(result['link'])
+                             domain = urlparse(clean_link).netloc if clean_link else ""
+
+                             # Skip if we've already seen this URL
+                             if clean_link in seen_urls:
+                                 continue
+
+                             # Skip if we have too many results from this domain
+                             if domain and seen_domains[domain] >= 2:  # Max 2 results per domain
+                                 continue
+
                              seen_urls.add(clean_link)
+                             if domain:
+                                 seen_domains[domain] += 1
+
                              result['link'] = clean_link
                              all_results.append(result)
+                             logging.info(f"Found new result: {result['title']} ({domain})")

                      # Small delay between searches
                      await asyncio.sleep(1.0)

                      # If we have enough unique results, we can stop early
+                     if len(all_results) >= MAX_SOURCES_TO_PROCESS * 2:  # Get more than we need for selection
+                         logging.info(f"Found enough unique results ({len(all_results)})")
                          break
+
                  except Exception as e:
                      logging.error(f"Error during continuous search: {e}")
                      await asyncio.sleep(2.0)  # Wait a bit before trying again

+             # Break if we've done several iterations
+             if search_iterations >= 4:  # Limit to 4 search iterations
+                 break
+
      # Filter and sort results by relevance
      if all_results:
+         # Simple relevance scoring
          def score_result(result):
              query_terms = set(query.lower().split())
              title = result['title'].lower()
              snippet = result['snippet'].lower()

              # Also consider length of snippet as a proxy for content richness
              snippet_length = len(result['snippet'].split())

+             # Prefer results from diverse domains
+             domain = urlparse(result['link']).netloc if result['link'] else ""
+             domain_score = 10 if seen_domains[domain] <= 1 else 5  # Bonus for unique domains
+
+             return matches * 10 + snippet_length + domain_score

          # Sort by score, descending
          all_results.sort(key=lambda x: score_result(x), reverse=True)

  async def filter_and_select_sources(results: List[dict]) -> List[dict]:
      """
      Filter and select the best sources from search results.
+     Returns a tuple of (selected_sources, rejected_sources_with_reasons)
      """
      if not results:
+         return [], []

      # Group by domain to ensure diversity
      domain_counts = defaultdict(int)
      domain_results = defaultdict(list)
      for result in results:
+         domain = urlparse(result['link']).netloc if result['link'] else ""
          domain_counts[domain] += 1
          domain_results[domain].append(result)

      selected = []
+     rejected = []

      # First pass: take the top result from each domain
      for domain, domain_res in domain_results.items():

              break
          # Take the best result from this domain (sorted by position in original results)
          if domain_res:
+             # Sort domain results by snippet length (proxy for content richness)
+             domain_res.sort(key=lambda x: len(x['snippet'].split()), reverse=True)
              selected.append(domain_res[0])

      # Second pass: if we need more, take additional results from domains with good content

      # Calculate average snippet length as a proxy for content quality
      domain_quality = {}
      for domain, domain_res in domain_results.items():
+         if not domain_res:
+             continue
          avg_length = sum(len(r['snippet'].split()) for r in domain_res) / len(domain_res)
          domain_quality[domain] = avg_length

          if len(selected) >= MAX_SOURCES_TO_PROCESS:
              break

+     # Third pass: if still need more, add remaining high-snippet-length results
      if len(selected) < MAX_SOURCES_TO_PROCESS:
+         # Sort all results by snippet length
+         remaining_results = [res for res in results if res not in selected]
+         remaining_results.sort(key=lambda x: len(x['snippet'].split()), reverse=True)
+
+         while len(selected) < MAX_SOURCES_TO_PROCESS and remaining_results:
+             selected.append(remaining_results.pop(0))
+
+     # The remaining results are our rejected ones (for now we won't track reasons)
+     rejected = [res for res in results if res not in selected]

+     return selected, rejected

  async def run_deep_research_stream(query: str, search_time: int = 120) -> AsyncGenerator[str, None]:
      def format_sse(data: dict) -> str:

          # Initialize the SSE stream with start message
          yield format_sse({
              "event": "status",
+             "data": f"Starting deep research on '{query}'. Searching for comprehensive sources..."
          })

          async with aiohttp.ClientSession() as session:
              # Step 1: Generate research plan
              yield format_sse({"event": "status", "data": "Generating comprehensive research plan..."})
              sub_questions = await generate_research_plan(query, session)
+             yield format_sse({
+                 "event": "plan",
+                 "data": {
+                     "sub_questions": sub_questions,
+                     "message": f"Research will focus on these {len(sub_questions)} key aspects"
+                 }
+             })

              # Step 2: Continuous search for better results
              yield format_sse({
                  "event": "status",
+                 "data": "Performing intelligent search for high-quality sources..."
+             })
+
+             # Show search variations we'll use
+             query_variations = [
+                 query,
+                 f"{query} comparison",
+                 f"{query} analysis",
+                 f"{query} review",
+                 f"{query} features",
+                 f"{query} vs alternatives"
+             ]
+             yield format_sse({
+                 "event": "status",
+                 "data": f"Using {len(query_variations)} different search variations to find diverse sources"
              })

              search_results = await continuous_search(query, search_time)
+
+             # Report on search results
+             unique_domains = len({urlparse(r['link']).netloc for r in search_results if r['link']})
              yield format_sse({
                  "event": "status",
+                 "data": f"Found {len(search_results)} potential sources from {unique_domains} unique domains"
              })

+             # Display some of the top sources found
+             if search_results:
+                 top_sources = search_results[:5]  # Show top 5
+                 sources_list = []
+                 for i, source in enumerate(top_sources, 1):
+                     domain = urlparse(source['link']).netloc if source['link'] else "Unknown"
+                     sources_list.append(f"{i}. {source['title']} ({domain})")
+
+                 yield format_sse({
+                     "event": "sources_found",
+                     "data": {
+                         "top_sources": sources_list,
+                         "total_sources": len(search_results)
+                     }
+                 })
+
              if not search_results:
                  yield format_sse({
                      "event": "error",

                  return

              # Select the best sources
+             selected_sources, rejected_sources = await filter_and_select_sources(search_results)
+
+             # Report on selected sources
+             unique_selected_domains = len({urlparse(r['link']).netloc for r in selected_sources if r['link']})
              yield format_sse({
                  "event": "status",
+                 "data": f"Selected {len(selected_sources)} high-quality sources from {unique_selected_domains} unique domains for in-depth analysis"
              })

              if not selected_sources:

                  })
                  return

+             # Show selected sources
+             selected_sources_list = []
+             for i, source in enumerate(selected_sources, 1):
+                 domain = urlparse(source['link']).netloc if source['link'] else "Unknown"
+                 selected_sources_list.append(f"{i}. {source['title']} ({domain})")
+
+             yield format_sse({
+                 "event": "sources_selected",
+                 "data": {
+                     "selected_sources": selected_sources_list,
+                     "message": "Proceeding with in-depth analysis of these sources"
+                 }
+             })
+
              # Step 3: Process selected sources with concurrency control
              semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
              consolidated_context = ""

                  if elapsed > TOTAL_TIMEOUT * 0.8:  # Leave 20% of time for synthesis
                      yield format_sse({
                          "event": "status",
+                         "data": f"Approaching time limit, stopping source processing after {i}/{len(selected_sources)} sources"
                      })
                      break

                  if i > 0:
                      await asyncio.sleep(REQUEST_DELAY * 0.5)

+                 # Notify about processing this source
+                 domain = urlparse(source['link']).netloc if source['link'] else "Unknown"
+                 yield format_sse({
+                     "event": "processing_source",
+                     "data": {
+                         "index": i + 1,
+                         "total": len(selected_sources),
+                         "title": source['title'],
+                         "domain": domain,
+                         "url": source['link']
+                     }
+                 })
+
                  task = asyncio.create_task(process_with_semaphore(source))
                  processing_tasks.append(task)

              # Process completed tasks as they finish
              for future in asyncio.as_completed(processing_tasks):
                  processed_sources += 1
                  content, source_info = await future
+
                  if content and content.strip():
+                     # Report successful processing
+                     domain = urlparse(source_info['link']).netloc if source_info['link'] else "Unknown"
+                     word_count = len(content.split())
+
+                     yield format_sse({
+                         "event": "source_processed",
+                         "data": {
+                             "title": source_info['title'],
+                             "domain": domain,
+                             "word_count": word_count,
+                             "status": "success"
+                         }
+                     })
+
+                     # Add to our consolidated context
                      consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
                      all_sources_used.append(source_info)
                      successful_sources += 1
+                     total_tokens += word_count  # Add to token count
                  else:
                      processing_errors += 1
+                     yield format_sse({
+                         "event": "source_processed",
+                         "data": {
+                             "title": source_info['title'],
+                             "status": "failed",
+                             "reason": "Could not extract sufficient content"
+                         }
+                     })

              if not consolidated_context.strip():
                  yield format_sse({

                  })
                  return

+             # Report on processing results
+             yield format_sse({
+                 "event": "status",
+                 "data": f"Successfully processed {successful_sources} of {processed_sources} sources, extracting approximately {total_tokens} words of content"
+             })
+
              # Step 4: Synthesize comprehensive report
              time_remaining = max(0, TOTAL_TIMEOUT - (time.time() - start_time))
              yield format_sse({
                  "event": "status",
+                 "data": f"Generating comprehensive analysis report from {successful_sources} sources..."
              })

              max_output_tokens = min(2000, int(time_remaining * 6))  # More aggressive token count

+             report_prompt = f"""Compose a comprehensive analysis report on "{query}".

  Structure the report with these sections:
+ 1. Executive Summary
  2. Key Features and Capabilities
+ 3. Comparative Analysis
+ 4. Strengths and Weaknesses
+ 5. Current Trends and