rkihacker committed
Commit 3c9a1a6 · verified · 1 Parent(s): ffce11c

Update main.py

Files changed (1):
  1. main.py +25 -40
main.py CHANGED
@@ -5,7 +5,6 @@ import logging
 import random
 import re
 from typing import AsyncGenerator, Optional, Tuple, List
-from urllib.parse import unquote
 
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
@@ -28,11 +27,12 @@ else:
     logger.info("LLM API Key loaded successfully.")
 
 # --- Constants & Headers ---
+SEARCH_API_URL = "https://search.privateinstance.com/api/text" # The new search provider
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "gpt-4.1-mini"
 MAX_SOURCES_TO_PROCESS = 15
 
-# Real Browser User Agents for Rotation
+# Real Browser User Agents for SCRAPING
 USER_AGENTS = [
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
@@ -46,19 +46,12 @@ class DeepResearchRequest(BaseModel):
 
 app = FastAPI(
     title="AI Deep Research API",
-    description="Provides robust, long-form, streaming deep research completions.",
-    version="7.0.0" # Final Production Version
+    description="Provides robust, long-form, streaming deep research completions using the PrivateInstance Search API.",
+    version="8.0.0" # Final Production Version with PrivateInstance API
 )
 
-# ***** CHANGE 1: Enable CORS for all origins *****
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"], # Allows all origins
-    allow_credentials=True,
-    allow_methods=["*"], # Allows all methods
-    allow_headers=["*"], # Allows all headers
-)
-logger.info("CORS middleware enabled for all origins.")
+# Enable CORS for all origins
+app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
 
 # --- Helper Functions ---
 def extract_json_from_llm_response(text: str) -> Optional[list]:
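The one-line add_middleware call keeps the same fully permissive policy as the removed block: all origins, methods, and headers, with credentials allowed. A preflight sanity check, sketched with Starlette's TestClient; the route path "/research" is hypothetical, since the endpoint itself is outside this diff:

from fastapi.testclient import TestClient
from main import app  # assumes main.py is importable in the test environment

client = TestClient(app)
resp = client.options(
    "/research",  # hypothetical path; substitute the real endpoint
    headers={
        "Origin": "https://example.com",
        "Access-Control-Request-Method": "POST",
    },
)
print(resp.status_code)
print(resp.headers.get("access-control-allow-origin"))

Note that allow_origins=["*"] together with allow_credentials=True is as permissive as CORS gets; reasonable for a public demo API, worth restricting otherwise.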
@@ -69,31 +62,26 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
     return None
 
 # --- Core Service Functions ---
-async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
-    search_url = f"https://html.duckduckgo.com/html/?q={query.replace(' ', '+')}"
-    headers = {'User-Agent': random.choice(USER_AGENTS)}
+async def call_privateinstance_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
+    """Performs a search using the PrivateInstance Search API."""
+    params = {'q': query, 'max_results': 10}
+    logger.info(f"Searching PrivateInstance API for: '{query}'")
     try:
-        async with session.get(search_url, headers=headers, timeout=15) as response:
-            response.raise_for_status(); html = await response.text()
-            soup = BeautifulSoup(html, "html.parser"); results = []
-            for res in soup.find_all('div', class_='result'):
-                title_tag = res.find('a', class_='result__a')
-                snippet_tag = res.find('a', class_='result__snippet')
-                if title_tag and snippet_tag and 'href' in title_tag.attrs:
-                    # ***** CHANGE 2: The critical fix for scraping. Decode the real URL. *****
-                    try:
-                        raw_link = title_tag['href']
-                        # The real URL is percent-encoded in the 'uddg' parameter
-                        actual_url = unquote(raw_link.split('uddg=')[1])
-                        if actual_url.startswith("http"):
-                            results.append({'title': title_tag.text, 'link': actual_url, 'snippet': snippet_tag.text})
-                    except IndexError:
-                        # This link format is unexpected, skip it
-                        continue
-            logger.info(f"Found {len(results)} valid sources from DuckDuckGo for: '{query}'")
+        async with session.get(SEARCH_API_URL, params=params, timeout=15) as response:
+            response.raise_for_status()
+            data = await response.json()
+            # The API might return results in a list directly or under a 'results' key.
+            raw_results = data if isinstance(data, list) else data.get('results', [])
+
+            # Map the API's response keys to our internal format
+            results = [
+                {'title': r.get('title'), 'link': r.get('url'), 'snippet': r.get('description')}
+                for r in raw_results if r.get('url') and r.get('title') and r.get('description')
+            ]
+            logger.info(f"Found {len(results)} sources from PrivateInstance for: '{query}'")
             return results
     except Exception as e:
-        logger.error(f"DuckDuckGo search failed for query '{query}': {e}"); return []
+        logger.error(f"PrivateInstance search failed for query '{query}': {e}"); return []
 
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
     headers = {'User-Agent': random.choice(USER_AGENTS)}
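The in-code comments above make two assumptions about the PrivateInstance payload: it may be a bare list or a {'results': [...]} envelope, and each entry carries title, url, and description keys. Pulled out as a pure function, the normalization step can be checked against both shapes with toy data (the sample payloads are hypothetical):

from typing import List

def normalize_results(data) -> List[dict]:
    # Same logic as call_privateinstance_search: accept either shape,
    # then keep only entries that have all three expected keys.
    raw_results = data if isinstance(data, list) else data.get('results', [])
    return [
        {'title': r.get('title'), 'link': r.get('url'), 'snippet': r.get('description')}
        for r in raw_results
        if r.get('url') and r.get('title') and r.get('description')
    ]

bare_list = [{'title': 'A', 'url': 'https://a.example', 'description': 'first'}]
enveloped = {'results': [
    {'title': 'B', 'url': 'https://b.example', 'description': 'second'},
    {'title': 'C', 'url': 'https://c.example'},  # no description: filtered out
]}

print(normalize_results(bare_list))   # [{'title': 'A', 'link': 'https://a.example', 'snippet': 'first'}]
print(normalize_results(enveloped))   # only the complete entry survives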
@@ -130,7 +118,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             yield format_sse({"event": "plan", "data": sub_questions})
 
             yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
-            search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
+            search_tasks = [call_privateinstance_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
             unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
 
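The unique_sources line folds every per-question result list into one dict keyed by URL, so each link survives exactly once and a later duplicate overwrites an earlier one. A tiny sketch with made-up data:

all_search_results = [
    [{'link': 'https://a.example', 'title': 'A (query 1)', 'snippet': 'text'}],
    [{'link': 'https://a.example', 'title': 'A (query 2)', 'snippet': 'text'},
     {'link': 'https://b.example', 'title': 'B', 'snippet': 'text'}],
]
unique_sources = list({source['link']: source
                       for results in all_search_results
                       for source in results}.values())
print([s['title'] for s in unique_sources])  # ['A (query 2)', 'B']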
@@ -162,8 +150,6 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                     line_str = line.decode('utf-8').strip()
                     if line_str.startswith('data:'): line_str = line_str[5:].strip()
                     if line_str == "[DONE]": break
-
-                    # ***** CHANGE 3: The definitive fix for the 'list index out of range' error *****
                     try:
                         chunk = json.loads(line_str)
                         choices = chunk.get("choices")
@@ -171,8 +157,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                             content = choices[0].get("delta", {}).get("content")
                             if content:
                                 yield format_sse({"event": "chunk", "data": content})
-                    except json.JSONDecodeError:
-                        continue
+                    except json.JSONDecodeError: continue
 
             yield format_sse({"event": "sources", "data": all_sources_used})
         except Exception as e:
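The streaming loop above strips the data: prefix, stops on the [DONE] sentinel, guards against empty choices (the old 'list index out of range' failure), and skips chunks that are not valid JSON. The same handling as a standalone sketch over fake stream lines; format_sse here is a hypothetical stand-in for the helper main.py defines outside this diff:

import json

def format_sse(payload: dict) -> str:
    # Hypothetical stand-in: main.py's real helper is not shown in this diff.
    return f"data: {json.dumps(payload)}\n\n"

fake_stream = [
    b'data: {"choices": [{"delta": {"content": "Hel"}}]}',
    b'data: {"choices": [{"delta": {"content": "lo"}}]}',
    b'data: not-valid-json',          # malformed chunk: skipped, not fatal
    b'data: {"choices": []}',         # empty choices: guarded, never indexed
    b'data: [DONE]',                  # sentinel: stop reading
]

for line in fake_stream:
    line_str = line.decode('utf-8').strip()
    if line_str.startswith('data:'): line_str = line_str[5:].strip()
    if line_str == "[DONE]": break
    try:
        chunk = json.loads(line_str)
        choices = chunk.get("choices")
        if choices:
            content = choices[0].get("delta", {}).get("content")
            if content:
                print(format_sse({"event": "chunk", "data": content}), end="")
    except json.JSONDecodeError:
        continue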
 
  except Exception as e: