rkihacker committed
Commit b7afcad · verified · 1 Parent(s): 427157a

Update main.py

Files changed (1)
  1. main.py +30 -48
main.py CHANGED
@@ -33,11 +33,10 @@ else:
 # --- Constants & Headers ---
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-MAX_SOURCES_TO_PROCESS = 6 # Reduced to stay within time limits with real requests
-MAX_CONCURRENT_REQUESTS = 3 # Be conservative with real websites
+MAX_SOURCES_TO_PROCESS = 6
+MAX_CONCURRENT_REQUESTS = 3
 RESEARCH_TIMEOUT = 180 # 3 minutes maximum
-REQUEST_DELAY = 2.0 # Longer delay between requests to be more polite
-USER_AGENT_ROTATION = True
+REQUEST_DELAY = 2.0
 
 # Initialize fake user agent generator
 try:
@@ -65,7 +64,7 @@ class DeepResearchRequest(BaseModel):
 app = FastAPI(
     title="AI Deep Research API",
     description="Provides robust, long-form, streaming deep research completions using real web searches.",
-    version="2.1.0" # Updated version
+    version="2.1.0"
 )
 app.add_middleware(
     CORSMiddleware,
@@ -87,18 +86,21 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
 
 async def get_real_user_agent() -> str:
     """Get a realistic user agent string."""
-    if USER_AGENT_ROTATION:
-        return ua.random()
-    return "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
+    try:
+        if isinstance(ua, UserAgent):
+            return ua.random
+        return ua.random() # For our fallback class
+    except:
+        return "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
 
 async def check_robots_txt(url: str) -> bool:
     """Check if scraping is allowed by robots.txt."""
     try:
-        domain = re.search(r'https?://([^/]+)', url)
-        if not domain:
+        domain_match = re.search(r'https?://([^/]+)', url)
+        if not domain_match:
             return False
 
-        domain = domain.group(1)
+        domain = domain_match.group(1)
         robots_url = f"https://{domain}/robots.txt"
 
         async with aiohttp.ClientSession() as session:
@@ -106,22 +108,20 @@ async def check_robots_txt(url: str) -> bool:
             async with session.get(robots_url, headers=headers, timeout=5) as response:
                 if response.status == 200:
                     robots = await response.text()
-                    # Simple check - disallow all if present
                     if "Disallow: /" in robots:
                         return False
-                    # Check for specific disallow rules for our path
+                    # Check for specific path disallows
                     path = re.sub(r'https?://[^/]+', '', url)
-                    if f"Disallow: {path}" in robots:
+                    if any(f"Disallow: {p}" in robots for p in [path, path.rstrip('/') + '/']):
                         return False
         return True
     except Exception as e:
         logging.warning(f"Could not check robots.txt for {url}: {e}")
-        return False # Default to not scraping if we can't check
+        return False
 
 async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
     """
     Perform a real search using DuckDuckGo's HTML interface.
-    Note: This may break if DuckDuckGo changes their HTML structure.
     """
     try:
         search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
@@ -144,20 +144,18 @@ async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
 
                 results = []
                 # Updated selectors for DuckDuckGo's current HTML structure
-                for result in soup.select('.result')[:max_results]:
+                for result in soup.select('.result__body')[:max_results]:
                     try:
                         title_elem = result.select_one('.result__title .result__a')
                         link_elem = title_elem if title_elem else result.select_one('a')
                         snippet_elem = result.select_one('.result__snippet')
 
                         if title_elem and link_elem and snippet_elem:
-                            # Clean up the URL
+                            # Handle DuckDuckGo's redirect URLs
                             link = link_elem['href']
                             if link.startswith('/l/'):
-                                # DuckDuckGo returns relative links that redirect
-                                # We need to follow these to get the actual URL
+                                redirect_url = f"https://duckduckgo.com{link}"
                                 try:
-                                    redirect_url = f"https://duckduckgo.com{link}"
                                     async with session.get(redirect_url, headers=headers, timeout=5, allow_redirects=False) as redirect_resp:
                                         if redirect_resp.status == 302:
                                             link = redirect_resp.headers.get('Location', link)
@@ -176,7 +174,6 @@ async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
 
         logging.info(f"Found {len(results)} real search results for '{query}'")
         return results
-
     except Exception as e:
         logging.error(f"Real search failed: {e}")
         return []
@@ -223,7 +220,6 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeo
             tag.decompose()
 
         # Try to find main content by common patterns
-        main_content = None
         selectors_to_try = [
             'main',
             'article',
@@ -236,6 +232,7 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeo
             '#content'
         ]
 
+        main_content = None
         for selector in selectors_to_try:
             main_content = soup.select_one(selector)
             if main_content:
@@ -244,10 +241,8 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeo
         if not main_content:
             # If no main content found, try to find the largest text block
             all_elements = soup.find_all()
-            # Filter out elements that are likely not main content
             candidates = [el for el in all_elements if el.name not in ['script', 'style', 'nav', 'footer', 'header']]
             if candidates:
-                # Sort by text length
                 candidates.sort(key=lambda x: len(x.get_text()), reverse=True)
                 main_content = candidates[0] if candidates else soup
 
@@ -270,7 +265,7 @@ async def process_web_source(session: aiohttp.ClientSession, source: dict, timeo
             content = " ".join(soup.stripped_strings)
             content = re.sub(r'\s+', ' ', content).strip()
 
-        if len(content.split()) < 30: # Minimum threshold for useful content
+        if len(content.split()) < 30:
             logging.warning(f"Very little content extracted from {source['link']}")
             return source.get('snippet', ''), source_info
 
@@ -312,25 +307,22 @@ async def generate_research_plan(query: str, session: aiohttp.ClientSession) ->
                 content = result['choices'][0]['message']['content']
                 sub_questions = extract_json_from_llm_response(content)
                 if sub_questions and isinstance(sub_questions, list):
-                    # Clean up the questions
                     cleaned = []
                     for q in sub_questions:
                         if isinstance(q, str) and q.strip():
                             cleaned_q = re.sub(r'^[^a-zA-Z0-9]*|[^a-zA-Z0-9]*$', '', q)
                             if cleaned_q:
                                 cleaned.append(cleaned_q)
-                    return cleaned[:5] # Limit to 5 questions max
+                    return cleaned[:5]
 
                 # Fallback if we couldn't get good questions from LLM
-                default_questions = [
+                return [
                     f"What is {query} and its key characteristics?",
                     f"What are the main aspects or components of {query}?",
                     f"What is the history and development of {query}?",
                     f"What are the current trends or recent developments in {query}?",
                     f"What are common challenges or controversies related to {query}?"
                 ]
-                return default_questions[:4]
-
     except Exception as e:
         logging.error(f"Failed to generate research plan: {e}")
         return [
@@ -416,7 +408,6 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             "data": f"Found {len(unique_sources)} unique sources to process."
         })
 
-        # If we have no sources, return early
         if not unique_sources:
             yield format_sse({
                 "event": "error",
@@ -439,7 +430,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
         for i, source in enumerate(unique_sources):
             # Check if we're running out of time
             elapsed = time.time() - start_time
-            if elapsed > RESEARCH_TIMEOUT * 0.7: # Leave 30% of time for synthesis
+            if elapsed > RESEARCH_TIMEOUT * 0.7:
                 yield format_sse({
                     "event": "status",
                     "data": f"Approaching time limit, stopping source processing at {i}/{len(unique_sources)}"
@@ -448,12 +439,11 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
 
             # Add delay between processing each source to be polite
             if i > 0:
-                await asyncio.sleep(REQUEST_DELAY * 0.5) # Shorter delay between same-domain requests
+                await asyncio.sleep(REQUEST_DELAY * 0.5)
 
             task = asyncio.create_task(process_with_semaphore(source))
             processing_tasks.append(task)
 
-            # Yield progress updates periodically
             if (i + 1) % 2 == 0 or (i + 1) == len(unique_sources):
                 yield format_sse({
                     "event": "status",
@@ -465,11 +455,10 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
 
             processed_sources += 1
             content, source_info = await future
             if content and content.strip():
-                # Add source content to our consolidated context
                 consolidated_context += f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n"
                 all_sources_used.append(source_info)
                 successful_sources += 1
-                total_tokens += len(content.split()) # Rough token count
+                total_tokens += len(content.split())
             else:
                 processing_errors += 1
@@ -480,14 +469,13 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             })
             return
 
-        # Step 4: Synthesize report with improved prompt
+        # Step 4: Synthesize report
         time_remaining = max(0, RESEARCH_TIMEOUT - (time.time() - start_time))
         yield format_sse({
             "event": "status",
             "data": f"Synthesizing report with content from {successful_sources} sources..."
        })
 
-        # Estimate how many tokens we can generate based on remaining time
         max_output_tokens = min(1500, int(time_remaining * 5))
 
         report_prompt = f"""Compose a comprehensive research report on "{query}".
@@ -496,16 +484,15 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
 
 Key requirements:
 1. Start with an introduction that explains what {query} is and why it's important
-2. Include well-organized sections with clear headings based on the research questions
+2. Include well-organized sections with clear headings
 3. Cite specific information from sources where appropriate
 4. End with a conclusion that summarizes key findings and insights
 5. Keep the report concise but comprehensive
 
 Available information (summarized from {successful_sources} sources):
-{consolidated_context[:18000]} # Increased context size but still limited
+{consolidated_context[:18000]}
 
-Generate a report that is approximately {max_output_tokens//4} words long (about {max_output_tokens//4//200} paragraphs).
-Focus on the most important and relevant information.
+Generate a report that is approximately {max_output_tokens//4} words long.
 """
 
         report_payload = {
@@ -515,11 +502,9 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             "max_tokens": max_output_tokens
         }
 
-        # Stream the report generation
        async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
            response.raise_for_status()
            async for line in response.content:
-                # Check if we're running out of time
                if time.time() - start_time > RESEARCH_TIMEOUT:
                    yield format_sse({
                        "event": "warning",
539
  content = choices[0].get("delta", {}).get("content")
540
  if content:
541
  yield format_sse({"event": "chunk", "data": content})
542
- except json.JSONDecodeError:
543
- continue
544
  except Exception as e:
545
  logging.warning(f"Error processing stream chunk: {e}")
546
  continue
547
 
548
- # Final status update
549
  duration = time.time() - start_time
550
  stats = {
551
  "total_time_seconds": round(duration),
 