Spaces:

rkihacker
/

Scrap

Paused

App Files Files Community

rkihacker committed on Sep 15

Commit

ffce11c

verified ·

1 Parent(s): 64f616b

Update main.py

Browse files

Files changed (1) hide show

main.py +38 -37

main.py CHANGED Viewed

@@ -5,10 +5,11 @@ import logging
 import random
 import re
 from typing import AsyncGenerator, Optional, Tuple, List
-from urllib.parse import quote_plus
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 from dotenv import load_dotenv
 import aiohttp
@@ -29,7 +30,7 @@ else:
 # --- Constants & Headers ---
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "gpt-4.1-mini"
-MAX_SOURCES_TO_PROCESS = 15 # Increase research depth for longer reports
 # Real Browser User Agents for Rotation
 USER_AGENTS = [
@@ -43,6 +44,23 @@ LLM_HEADERS = {"Authorization": f"Bearer {LLM_API_KEY}", "Content-Type": "applic
 class DeepResearchRequest(BaseModel):
     query: str
 def extract_json_from_llm_response(text: str) -> Optional[list]:
     match = re.search(r'\[.*\]', text, re.DOTALL)
     if match:
@@ -50,26 +68,29 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
         except json.JSONDecodeError: return None
     return None
-app = FastAPI(
-    title="AI Deep Research API",
-    description="Provides robust, long-form, streaming deep research completions.",
-    version="6.0.0"  # Final Production Version
-)
 # --- Core Service Functions ---
 async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
-    search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
         async with session.get(search_url, headers=headers, timeout=15) as response:
             response.raise_for_status(); html = await response.text()
             soup = BeautifulSoup(html, "html.parser"); results = []
             for res in soup.find_all('div', class_='result'):
-                title_tag, snippet_tag = res.find('a', class_='result__a'), res.find('a', class_='result__snippet')
                 if title_tag and snippet_tag and 'href' in title_tag.attrs:
-                    cleaned_link = re.sub(r'/l/\?kh=-1&uddg=', '', title_tag['href'])
-                    results.append({'title': title_tag.text, 'link': cleaned_link, 'snippet': snippet_tag.text})
-            logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
             return results
     except Exception as e:
         logger.error(f"DuckDuckGo search failed for query '{query}': {e}"); return []
@@ -77,6 +98,7 @@ async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str) ->
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
         if source['link'].lower().endswith('.pdf'): raise ValueError("PDF content")
         async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
             if response.status != 200: raise ValueError(f"HTTP status {response.status}")
@@ -95,7 +117,6 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
     def format_sse(data: dict) -> str: return f"data: {json.dumps(data)}\n\n"
     try:
         async with aiohttp.ClientSession() as session:
-            # Step 1: Generate Research Plan
             yield format_sse({"event": "status", "data": "Generating research plan..."})
             plan_prompt = {"model": LLM_MODEL, "messages": [{"role": "user", "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array. Example: [\"Question 1?\"]"}]}
             try:
@@ -108,17 +129,14 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             yield format_sse({"event": "plan", "data": sub_questions})
-            # Step 2: Conduct Deep Research
             yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
             search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
             unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
             if not unique_sources:
                 yield format_sse({"event": "error", "data": "All search queries returned zero usable sources."}); return
-            # Limit the number of sources to process for very long reports
             sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
             yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})
@@ -134,23 +152,8 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             if not consolidated_context.strip():
                 yield format_sse({"event": "error", "data": "Failed to gather any research context."}); return
-            # Step 3: Synthesize Long-Form Final Report
             yield format_sse({"event": "status", "data": "Synthesizing final report..."})
-            # ***** ENHANCED PROMPT FOR LONGEST POSSIBLE REPORT *****
-            report_prompt = f"""
-You are an expert research analyst. Your task is to synthesize the provided context into a long-form, comprehensive, multi-page report on the topic: "{query}".
-Follow these instructions carefully:
-1.  Write in a professional, academic tone.
-2.  Structure the report with a clear introduction, multiple detailed sections with sub-headings using Markdown, and a concluding summary.
-3.  Elaborate extensively on each point. Use multiple paragraphs for each section to explore the nuances of the topic.
-4.  Base your entire report *only* on the information provided in the context below. Do not use any external knowledge.
-5.  Aim for the most detailed and thorough report possible based on the given material.
-## Research Context ##
-{consolidated_context}
-"""
             report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": report_prompt}], "stream": True}
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
@@ -160,7 +163,7 @@ Follow these instructions carefully:
                     if line_str.startswith('data:'): line_str = line_str[5:].strip()
                     if line_str == "[DONE]": break
-                    # ***** FIX FOR 'list index out of range' ERROR *****
                     try:
                         chunk = json.loads(line_str)
                         choices = chunk.get("choices")
@@ -169,9 +172,8 @@ Follow these instructions carefully:
                             if content:
                                 yield format_sse({"event": "chunk", "data": content})
                     except json.JSONDecodeError:
-                        continue # Ignore malformed lines
-            # Final event with all source data
             yield format_sse({"event": "sources", "data": all_sources_used})
     except Exception as e:
         logger.error(f"A critical error occurred: {e}", exc_info=True)
@@ -179,7 +181,6 @@ Follow these instructions carefully:
     finally:
         yield format_sse({"event": "done", "data": "Deep research complete."})
-# --- API Endpoints ---
 @app.post("/v1/deepresearch/completions")
 async def deep_research_endpoint(request: DeepResearchRequest):
     return StreamingResponse(run_deep_research_stream(request.query), media_type="text/event-stream")

 import random
 import re
 from typing import AsyncGenerator, Optional, Tuple, List
+from urllib.parse import unquote
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
+from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from dotenv import load_dotenv
 import aiohttp
 # --- Constants & Headers ---
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "gpt-4.1-mini"
+MAX_SOURCES_TO_PROCESS = 15
 # Real Browser User Agents for Rotation
 USER_AGENTS = [
 # Request body accepted by the POST /v1/deepresearch/completions endpoint.
 class DeepResearchRequest(BaseModel):
     # Research topic; the endpoint passes it to run_deep_research_stream,
     # which interpolates it into the planning and report prompts.
     query: str
+app = FastAPI(
+    title="AI Deep Research API",
+    description="Provides robust, long-form, streaming deep research completions.",
+    version="7.0.0"  # Final Production Version
+)
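+# ***** CHANGE 1: Enable CORS for all origins *****
+# NOTE(review): the FastAPI CORS documentation says allow_origins,
+# allow_methods and allow_headers must not be ["*"] when
+# allow_credentials=True. Confirm whether credentialed cross-origin requests
+# are actually needed; if not, drop allow_credentials, otherwise list the
+# permitted origins explicitly.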
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Allows all origins
+    allow_credentials=True,
+    allow_methods=["*"],  # Allows all methods
+    allow_headers=["*"],  # Allows all headers
+)
+logger.info("CORS middleware enabled for all origins.")
+# --- Helper Functions ---
 def extract_json_from_llm_response(text: str) -> Optional[list]:
     match = re.search(r'\[.*\]', text, re.DOTALL)
     if match:
         except json.JSONDecodeError: return None
     return None
 # --- Core Service Functions ---
 def _extract_ddg_target(href: str) -> Optional[str]:
     """Return the destination URL carried by a DuckDuckGo redirect link.

     Args:
         href: The raw ``href`` of a result anchor; the destination is expected
             in its ``uddg`` query parameter.

     Returns:
         The decoded destination when it starts with ``http``; ``None`` when
         the link has no ``uddg`` parameter or the target is not an http(s) URL.
     """
     from urllib.parse import parse_qs, urlparse

     # parse_qs splits on '&' before percent-decoding, so the destination is
     # isolated from any parameters that follow it (e.g. ``rut``); slicing the
     # string after 'uddg=' would leave those glued onto the URL.
     targets = parse_qs(urlparse(href).query).get('uddg')
     if not targets:
         return None
     target = targets[0]
     return target if target.startswith("http") else None

 async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str) -> List[dict]:
     """Query DuckDuckGo's HTML endpoint and return the parsed results.

     Args:
         session: Open aiohttp session used to issue the GET request.
         query: Free-text search query.

     Returns:
         A list of dicts with ``title``, ``link`` and ``snippet`` keys, one per
         result whose redirect link resolves to an http(s) URL. Any failure
         (network error, non-2xx status, parsing error) is logged and yields
         an empty list instead of raising.
     """
     from urllib.parse import quote_plus

     # quote_plus escapes reserved characters (&, #, +, ?, non-ASCII) as well
     # as spaces; replacing spaces alone lets such characters corrupt the URL.
     search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
         async with session.get(search_url, headers=headers, timeout=15) as response:
             response.raise_for_status()
             html = await response.text()
             soup = BeautifulSoup(html, "html.parser")
             results = []
             for res in soup.find_all('div', class_='result'):
                 title_tag = res.find('a', class_='result__a')
                 snippet_tag = res.find('a', class_='result__snippet')
                 if not (title_tag and snippet_tag and 'href' in title_tag.attrs):
                     continue
                 actual_url = _extract_ddg_target(title_tag['href'])
                 if actual_url is None:
                     # Unexpected link format or non-http target: skip it.
                     continue
                 results.append({'title': title_tag.text, 'link': actual_url, 'snippet': snippet_tag.text})
             logger.info(f"Found {len(results)} valid sources from DuckDuckGo for: '{query}'")
             return results
     except Exception as e:
         # Best-effort boundary: one failed search must not abort the whole
         # research run, so log the error and report no sources.
         logger.error(f"DuckDuckGo search failed for query '{query}': {e}")
         return []
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
+        logger.info(f"Scraping: {source['link']}")
         if source['link'].lower().endswith('.pdf'): raise ValueError("PDF content")
         async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
             if response.status != 200: raise ValueError(f"HTTP status {response.status}")
     def format_sse(data: dict) -> str: return f"data: {json.dumps(data)}\n\n"
     try:
         async with aiohttp.ClientSession() as session:
             yield format_sse({"event": "status", "data": "Generating research plan..."})
             plan_prompt = {"model": LLM_MODEL, "messages": [{"role": "user", "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array. Example: [\"Question 1?\"]"}]}
             try:
             yield format_sse({"event": "plan", "data": sub_questions})
             yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
             search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
             unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
             if not unique_sources:
                 yield format_sse({"event": "error", "data": "All search queries returned zero usable sources."}); return
             sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
             yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})
             if not consolidated_context.strip():
                 yield format_sse({"event": "error", "data": "Failed to gather any research context."}); return
             yield format_sse({"event": "status", "data": "Synthesizing final report..."})
+            report_prompt = f'Synthesize the provided context into a long-form, comprehensive, multi-page report on "{query}". Use markdown. Elaborate extensively on each point. Base your entire report ONLY on the provided context.\n\n## Research Context ##\n{consolidated_context}'
             report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": report_prompt}], "stream": True}
             async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                     if line_str.startswith('data:'): line_str = line_str[5:].strip()
                     if line_str == "[DONE]": break
+                    # ***** CHANGE 3: The definitive fix for the 'list index out of range' error *****
                     try:
                         chunk = json.loads(line_str)
                         choices = chunk.get("choices")
                             if content:
                                 yield format_sse({"event": "chunk", "data": content})
                     except json.JSONDecodeError:
+                        continue
             yield format_sse({"event": "sources", "data": all_sources_used})
     except Exception as e:
         logger.error(f"A critical error occurred: {e}", exc_info=True)
     finally:
         yield format_sse({"event": "done", "data": "Deep research complete."})
 @app.post("/v1/deepresearch/completions")
 async def deep_research_endpoint(request: DeepResearchRequest):
     # Hand the research generator to StreamingResponse so its formatted
     # events are sent to the client as a text/event-stream body.
     event_stream = run_deep_research_stream(request.query)
     return StreamingResponse(event_stream, media_type="text/event-stream")