Update main.py
main.py (CHANGED)
@@ -6,7 +6,7 @@ import random
 import re
 import time
 from typing import AsyncGenerator, Optional, Tuple, List, Dict
-from urllib.parse import quote_plus, urlparse
+from urllib.parse import quote_plus, urlparse, unquote
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
@@ -35,13 +35,13 @@ else:
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
 MAX_SOURCES_TO_PROCESS = 10
-MAX_CONCURRENT_REQUESTS =
-SEARCH_TIMEOUT = 90
+MAX_CONCURRENT_REQUESTS = 2 # Further reduced to avoid rate-limiting
+SEARCH_TIMEOUT = 90
 TOTAL_TIMEOUT = 180
-REQUEST_DELAY =
-USER_AGENT_ROTATION = True
-RETRY_ATTEMPTS = 3 # Number of retries for failed search requests
-RETRY_DELAY = 3.0 # Delay between retries
+REQUEST_DELAY = 3.0 # Increased to avoid rate-limiting
+RETRY_ATTEMPTS = 5 # Increased retry attempts
+RETRY_DELAY = 5.0 # Increased delay between retries
+USER_AGENT_ROTATION = True

 # Initialize fake user agent generator
 try:
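For context on the two rate-limiting knobs above: MAX_CONCURRENT_REQUESTS caps how many fetches run at once and REQUEST_DELAY spaces them out. A minimal sketch of how they typically combine, assuming an asyncio.Semaphore; the rate_limited helper is illustrative and not part of main.py:

import asyncio

MAX_CONCURRENT_REQUESTS = 2
REQUEST_DELAY = 3.0

# Hypothetical helper, not in main.py: at most MAX_CONCURRENT_REQUESTS
# coroutines hold a slot at once, and each sleeps REQUEST_DELAY seconds
# before releasing its slot.
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

async def rate_limited(fetch, url: str):
    async with semaphore:
        result = await fetch(url)
        await asyncio.sleep(REQUEST_DELAY)
        return result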
@@ -64,7 +64,7 @@ LLM_HEADERS = {

 class DeepResearchRequest(BaseModel):
     query: str
-    search_time: int = 90
+    search_time: int = 90

 app = FastAPI(
     title="AI Deep Research API",
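The endpoint hunk at the end of this diff clamps search_time into the 60-180 range with min/max. An alternative sketch that enforces the same bound at validation time with Pydantic's Field; this is illustrative, not what the commit does:

from pydantic import BaseModel, Field

class BoundedResearchRequest(BaseModel):
    # Hypothetical variant of DeepResearchRequest: out-of-range values
    # are rejected by validation instead of silently clamped.
    query: str
    search_time: int = Field(default=90, ge=60, le=180)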
@@ -103,7 +103,6 @@ def clean_url(url: str) -> str:
     if not url:
         return ""

-    # Handle DuckDuckGo redirect URLs
     if url.startswith('//duckduckgo.com/l/'):
         url = f"https:{url}"
     try:
@@ -112,17 +111,10 @@
         if 'uddg=' in query_params:
             match = re.search(r'uddg=([^&]+)', query_params)
             if match:
-                encoded_url = match.group(1)
-                try:
-                    # Properly decode the URL
-                    from urllib.parse import unquote
-                    return unquote(encoded_url)
-                except:
-                    pass
+                return unquote(match.group(1))
     except:
         pass

-    # Ensure URL has proper scheme
     if url.startswith('//'):
         url = 'https:' + url
     elif not url.startswith(('http://', 'https://')):
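DuckDuckGo's HTML interface wraps result links in a //duckduckgo.com/l/ redirect whose uddg query parameter carries the percent-encoded destination, which is why the simplified branch above can return unquote(match.group(1)) directly. The decode path as a self-contained sketch; decode_ddg_redirect is an illustrative stand-in for this part of clean_url:

import re
from urllib.parse import unquote, urlparse

def decode_ddg_redirect(url: str) -> str:
    # Mirror of the logic in the hunk above, reduced for illustration.
    if url.startswith('//duckduckgo.com/l/'):
        url = f"https:{url}"
    query_params = urlparse(url).query
    match = re.search(r'uddg=([^&]+)', query_params)
    if match:
        return unquote(match.group(1))  # percent-decoded target URL
    return url

# decode_ddg_redirect('//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2F')
# -> 'https://example.com/'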
@@ -157,7 +149,7 @@ async def check_robots_txt(url: str) -> bool:

 async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
     """
-    Perform a real search using DuckDuckGo's HTML interface with retry logic.
+    Perform a real search using DuckDuckGo's HTML interface with robust retry logic.
     """
     headers = {
         "User-Agent": await get_real_user_agent(),
@@ -218,7 +210,7 @@ async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
             logging.error(f"Search attempt {attempt + 1} failed for '{query}': {e}")
             if attempt < RETRY_ATTEMPTS - 1:
                 await asyncio.sleep(RETRY_DELAY)
-
+                continue
     logging.error(f"All {RETRY_ATTEMPTS} search attempts failed for '{query}'")
     return []

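The added continue sends the except branch straight into the next attempt after the sleep instead of falling through. The overall retry shape, reduced to a sketch with the new constants; fetch is a stand-in for the actual DuckDuckGo request:

import asyncio
import logging

RETRY_ATTEMPTS = 5
RETRY_DELAY = 5.0

async def fetch_with_retries(fetch, query: str):
    for attempt in range(RETRY_ATTEMPTS):
        try:
            return await fetch(query)
        except Exception as e:
            logging.error(f"Search attempt {attempt + 1} failed for '{query}': {e}")
            if attempt < RETRY_ATTEMPTS - 1:
                await asyncio.sleep(RETRY_DELAY)
                continue  # next attempt
    logging.error(f"All {RETRY_ATTEMPTS} search attempts failed for '{query}'")
    return []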
@@ -376,6 +368,10 @@ async def generate_research_plan(query: str, session: aiohttp.ClientSession) ->
         ]
     except Exception as e:
         logging.error(f"Failed to generate research plan: {e}")
+        yield format_sse({
+            "event": "error",
+            "data": f"Failed to generate research plan: {str(e)[:200]}"
+        })
         return [
             f"What is {query}?",
             f"What are the key aspects of {query}?",
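format_sse itself is outside this diff. Assuming it serializes an event dict into a Server-Sent Events frame, a minimal version would look like this; the actual helper in main.py may differ:

import json

def format_sse(event: dict) -> str:
    # An SSE frame is a "data: <payload>" line terminated by a blank line.
    return f"data: {json.dumps(event)}\n\n"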
@@ -390,6 +386,7 @@ async def continuous_search(query: str, search_time: int = 90) -> List[dict]:
     start_time = time.time()
     all_results = []
     seen_urls = set()
+    fallback_results = []

     query_variations = [
         query,
@@ -405,7 +402,7 @@ async def continuous_search(query: str, search_time: int = 90) -> List[dict]:
     while time.time() - start_time < search_time:
         iteration += 1
         random.shuffle(query_variations)
-        for q in query_variations
+        for q in query_variations:
             if time.time() - start_time >= search_time:
                 logger.info(f"Search timed out after {search_time} seconds. Found {len(all_results)} results.")
                 break
@@ -421,6 +418,7 @@ async def continuous_search(query: str, search_time: int = 90) -> List[dict]:
                     result['link'] = clean_link
                     all_results.append(result)
                     logger.info(f"Added new result: {result['title']} ({result['link']})")
+                    fallback_results.append(result) # Store for fallback

             await asyncio.sleep(REQUEST_DELAY)
             if len(all_results) >= MAX_SOURCES_TO_PROCESS * 1.5:
@@ -435,6 +433,11 @@ async def continuous_search(query: str, search_time: int = 90) -> List[dict]:

     logger.info(f"Completed continuous search. Total results: {len(all_results)}")

+    # Fallback if insufficient results
+    if len(all_results) < MAX_SOURCES_TO_PROCESS:
+        logger.warning(f"Insufficient results ({len(all_results)}), using fallback results")
+        all_results.extend(fallback_results[:MAX_SOURCES_TO_PROCESS - len(all_results)])
+
     if all_results:
         def score_result(result):
             query_terms = set(query.lower().split())
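score_result is cut off at the hunk boundary right after its first line. A plausible term-overlap scorer consistent with that opening, purely for illustration (the 'snippet' key and the exact scoring are assumptions, not the repository's code):

def score_result(result: dict, query: str) -> int:
    # Count how many distinct query terms appear in the title/snippet.
    query_terms = set(query.lower().split())
    text = f"{result.get('title', '')} {result.get('snippet', '')}".lower()
    return sum(term in text for term in query_terms)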
@@ -606,6 +609,10 @@ async def run_deep_research_stream(query: str, search_time: int = 90) -> AsyncGe
                     })
                 else:
                     processing_errors += 1
+                    yield format_sse({
+                        "event": "warning",
+                        "data": f"Failed to extract content from {source_info['link']}"
+                    })

         if not consolidated_context.strip():
             yield format_sse({
@@ -655,7 +662,14 @@ async def run_deep_research_stream(query: str, search_time: int = 90) -> AsyncGe
             }

             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
-                response.raise_for_status()
+                if response.status != 200:
+                    yield format_sse({
+                        "event": "error",
+                        "data": f"Failed to generate report: HTTP {response.status}"
+                    })
+                    return
+
+                buffer = ""
                 async for line in response.content:
                     if time.time() - start_time > TOTAL_TIMEOUT:
                         yield format_sse({
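With aiohttp, response.status is known as soon as the headers arrive, so the added guard can emit a single SSE error event and stop before reading the body. The pattern in isolation, as a hypothetical reduction that reuses the format_sse sketch above:

import aiohttp

async def post_and_stream(session: aiohttp.ClientSession, url: str, payload: dict):
    async with session.post(url, json=payload) as response:
        if response.status != 200:
            # Surface the failure as one error event instead of raising
            # in the middle of an already-started SSE stream.
            yield format_sse({"event": "error", "data": f"HTTP {response.status}"})
            return
        async for line in response.content:
            yield line.decode("utf-8", errors="ignore")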
@@ -668,6 +682,8 @@
                     if line_str.startswith('data:'):
                         line_str = line_str[5:].strip()
                         if line_str == "[DONE]":
+                            if buffer:
+                                yield format_sse({"event": "chunk", "data": buffer})
                             break
                         try:
                             chunk = json.loads(line_str)
@@ -675,11 +691,17 @@
                             if choices and isinstance(choices, list) and len(choices) > 0:
                                 content = choices[0].get("delta", {}).get("content")
                                 if content:
-                                    yield format_sse({"event": "chunk", "data": content})
+                                    buffer += content
+                                    if len(buffer) > 100: # Flush buffer periodically
+                                        yield format_sse({"event": "chunk", "data": buffer})
+                                        buffer = ""
                         except Exception as e:
                             logging.warning(f"Error processing stream chunk: {e}")
                             continue

+                if buffer:
+                    yield format_sse({"event": "chunk", "data": buffer})
+
                 duration = time.time() - start_time
                 stats = {
                     "total_time_seconds": round(duration),
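Taken together, the buffer changes in the last three hunks coalesce token deltas into roughly 100-character SSE chunks and flush whatever remains at [DONE] or when the stream ends. The pattern as a standalone sketch:

from typing import AsyncGenerator, AsyncIterator

async def coalesce(deltas: AsyncIterator[str], flush_at: int = 100) -> AsyncGenerator[str, None]:
    # Accumulate small deltas and emit them in larger chunks.
    buffer = ""
    async for delta in deltas:
        buffer += delta
        if len(buffer) > flush_at:
            yield buffer
            buffer = ""
    if buffer:
        yield buffer  # final partial chunk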
@@ -722,9 +744,10 @@ async def deep_research_endpoint(request: DeepResearchRequest):
     search_time = min(max(request.search_time, 60), 180)
     return StreamingResponse(
         run_deep_research_stream(request.query.strip(), search_time),
-        media_type="text/event-stream"
+        media_type="text/event-stream",
+        headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
     )

 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run(app, host="0.0.0.0", port=8000)
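With the new no-cache and keep-alive headers in place, the stream can be exercised end to end. A minimal client sketch, assuming the server runs locally on port 8000 and that httpx is installed; the route path is not visible in this diff, so '/deep-research' is a guess:

import asyncio
import httpx

async def main():
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream(
            "POST", "http://localhost:8000/deep-research",  # assumed route
            json={"query": "quantum computing", "search_time": 90},
        ) as response:
            async for line in response.aiter_lines():
                if line.startswith("data:"):
                    print(line[5:].strip())

asyncio.run(main())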