Spaces:

rkihacker
/

Scrap

Paused

App Files Files Community

rkihacker committed on Sep 17

Commit

0359a50

verified ·

1 Parent(s): 6ac9507

Update main.py

Browse files

Files changed (1) hide show

main.py +69 -26

main.py CHANGED Viewed

@@ -16,7 +16,7 @@ from bs4 import BeautifulSoup
 # --- Configuration ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = aiohttp.log.access_logger # Use aiohttp's logger for better async context
 load_dotenv()
 LLM_API_KEY = os.getenv("LLM_API_KEY")
@@ -30,6 +30,7 @@ else:
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
 MAX_SOURCES_TO_PROCESS = 15
 # Real Browser User Agents for SCRAPING
 USER_AGENTS = [
@@ -45,8 +46,8 @@ class DeepResearchRequest(BaseModel):
 app = FastAPI(
     title="AI Deep Research API",
-    description="Provides robust, long-form, streaming deep research completions using a simulated search.",
-    version="10.0.0"  # Final: Using simulated search to bypass external blocking.
 )
 app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
@@ -58,33 +59,75 @@ def extract_json_from_llm_response(text: str) -> Optional[list]:
         except json.JSONDecodeError: return None
     return None
-async def call_duckduckgo_search(query: str, max_results: int = 10) -> List[dict]:
     """
-    Simulates a successful DuckDuckGo search to bypass anti-scraping measures.
-    This function returns a static, hardcoded list of relevant search results
-    for the topic "Nian" (Chinese New Year beast), allowing the rest of the
-    application pipeline to be tested.
     """
-    logging.info(f"Simulating search for: '{query}'")
-    # Static results related to "Nian" myth, as "niansuh" yields no results.
-    # This provides the scraper with valid URLs to process.
-    simulated_results = [
-        {'title': 'Nian - Wikipedia', 'link': 'https://en.wikipedia.org/wiki/Nian', 'snippet': 'The Nian is a beast from Chinese mythology. The Nian is said to have the body of a bull, the head of a lion with a single horn, and sharp teeth.'},
-        {'title': 'The Legend of Nian and the Origins of Chinese New Year', 'link': 'https://www.chinahighlights.com/travelguide/festivals/story-of-nian.htm', 'snippet': 'Learn about the monster Nian and how the traditions of wearing red, setting off firecrackers, and staying up late came to be part of Chinese New Year.'},
-        {'title': 'Nian: The Beast That Invented Chinese New Year - Culture Trip', 'link': 'https://theculturetrip.com/asia/china/articles/nian-the-beast-that-invented-chinese-new-year', 'snippet': 'Once a year, at the beginning of Chinese New Year, a beast named Nian would terrorize a small village in China, eating their crops, livestock, and children.'},
-        {'title': 'Chinese New Year mythology: The story of Nian - British Museum', 'link': 'https://www.britishmuseum.org/blog/chinese-new-year-mythology-story-nian', 'snippet': 'Discover the mythical origins of the Chinese New Year celebration and the fearsome beast, Nian.'},
-        {'title': 'Year of the Nian Monster - Asian Art Museum', 'link': 'https://education.asianart.org/resources/year-of-the-nian-monster/', 'snippet': 'A summary of the story of the Nian monster for educators and children, explaining the connection to modern traditions.'}
-    ]
-    logging.info(f"Returning {len(simulated_results)} static sources.")
-    return simulated_results[:max_results]
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
-        logging.info(f"Scraping: {source['link']}")
         if source['link'].lower().endswith('.pdf'): raise ValueError("PDF content")
         async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
             if response.status != 200: raise ValueError(f"HTTP status {response.status}")
@@ -95,7 +138,7 @@ async def research_and_process_source(session: aiohttp.ClientSession, source: di
             if not content.strip(): raise ValueError("Parsed content is empty.")
             return content, source
     except Exception as e:
-        logging.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
         return source.get('snippet', ''), source
 async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
@@ -114,13 +157,13 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             yield format_sse({"event": "plan", "data": sub_questions})
-            yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
-            search_tasks = [call_duckduckgo_search(sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
             unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
             if not unique_sources:
-                yield format_sse({"event": "error", "data": "The simulated search returned no sources. Check the hardcoded list."}); return
             sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
             yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})
@@ -135,7 +178,7 @@ async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
                     all_sources_used.append(source_info)
             if not consolidated_context.strip():
-                yield format_sse({"event": "error", "data": "Failed to scrape content from any of the discovered sources."}); return
             yield format_sse({"event": "status", "data": "Synthesizing final report..."})
             report_prompt = f'Synthesize the provided context into a long-form, comprehensive, multi-page report on "{query}". Use markdown. Elaborate extensively on each point. Base your entire report ONLY on the provided context.\n\n## Research Context ##\n{consolidated_context}'

 # --- Configuration ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 load_dotenv()
 LLM_API_KEY = os.getenv("LLM_API_KEY")
 LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
 LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
 MAX_SOURCES_TO_PROCESS = 15
+SEARCH_PAGES_TO_FETCH = 2 # Fetch first 2 pages of results for each query
 # Real Browser User Agents for SCRAPING
 USER_AGENTS = [
 app = FastAPI(
     title="AI Deep Research API",
+    description="Provides robust, long-form, streaming deep research completions using a live, multi-page DuckDuckGo search.",
+    version="11.0.0"  # Implemented robust, multi-page live web search
 )
 app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
         except json.JSONDecodeError: return None
     return None
def parse_search_results(soup: BeautifulSoup) -> List[dict]:
    """Extract search results from a parsed DuckDuckGo HTML results page.

    Every ``<div class="result">`` that contains both a title anchor
    (``a.result__a``) and a snippet anchor (``a.result__snippet``) yields one
    entry. Entries with an empty link, title or snippet are skipped.

    Args:
        soup: Parsed HTML of a single results page.

    Returns:
        A list of ``{'title', 'link', 'snippet'}`` dicts in page order. The
        ``link`` is made directly fetchable: protocol-relative hrefs get an
        ``https:`` scheme and DuckDuckGo redirect links are replaced by the
        target URL they wrap.
    """
    # Local import so this block does not depend on the file's import header.
    from urllib.parse import parse_qs, urlparse

    results = []
    for result_div in soup.find_all('div', class_='result'):
        title_elem = result_div.find('a', class_='result__a')
        snippet_elem = result_div.find('a', class_='result__snippet')
        if not (title_elem and snippet_elem):
            continue

        link = title_elem.get('href')
        title = title_elem.get_text(strip=True)
        snippet = snippet_elem.get_text(strip=True)
        if not (link and title and snippet):
            continue

        # A protocol-relative href ("//host/path") has no scheme and cannot be
        # requested as-is by an HTTP client.
        if link.startswith('//'):
            link = 'https:' + link

        # NOTE(review): assumes DuckDuckGo's HTML endpoint wraps outbound links
        # as ".../l/?uddg=<percent-encoded target>&..." -- confirm against a
        # live response. Plain links fall through unchanged.
        parsed = urlparse(link)
        is_ddg_host = not parsed.netloc or parsed.netloc.endswith('duckduckgo.com')
        if is_ddg_host and parsed.path.startswith('/l/'):
            # parse_qs already percent-decodes the value.
            targets = parse_qs(parsed.query).get('uddg')
            if targets and targets[0]:
                link = targets[0]

        results.append({'title': title, 'link': link, 'snippet': snippet})
    return results
async def call_duckduckgo_search(session: aiohttp.ClientSession, query: str, max_results: int = 15) -> List[dict]:
    """Search DuckDuckGo's HTML interface, following "Next" pagination.

    Up to ``SEARCH_PAGES_TO_FETCH`` pages are requested sequentially with a
    short random pause between them. The search is best-effort: a non-200
    response, a missing "Next" form, or any exception ends the search early
    and whatever was collected so far is returned.

    Args:
        session: Shared aiohttp client session used for the POST requests.
        query: Search phrase for the first page.
        max_results: Upper bound on the number of results returned.

    Returns:
        At most ``max_results`` result dicts as produced by
        ``parse_search_results``; an empty list if nothing could be fetched.
    """
    logger.info(f"Starting multi-page search for: '{query}'")
    search_url = "https://html.duckduckgo.com/html/"
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': random.choice(USER_AGENTS),
        'Referer': 'https://html.duckduckgo.com/'
    }
    request_timeout = aiohttp.ClientTimeout(total=15)
    all_results = []
    pages_fetched = 0
    payload = {'q': query}
    try:
        for page in range(SEARCH_PAGES_TO_FETCH):
            logger.info(f"Searching page {page + 1} for '{query}'...")
            async with session.post(search_url, data=payload, headers=headers, timeout=request_timeout) as response:
                if response.status != 200:
                    logger.warning(f"Search for '{query}' page {page+1} returned status {response.status}. Stopping search for this query.")
                    break
                html = await response.text()

            soup = BeautifulSoup(html, "html.parser")
            all_results.extend(parse_search_results(soup))
            pages_fetched += 1

            # Nothing more to request: skip the pagination lookup and the delay.
            if page + 1 >= SEARCH_PAGES_TO_FETCH:
                break

            # Locate the pagination form by its "Next" submit button. Filtering
            # the <form> itself with string=... never matches, because a tag
            # with several children has .string == None.
            next_form = None
            for form in soup.find_all('form', action='/html/', method='post'):
                if form.find('input', attrs={'type': 'submit', 'value': 'Next'}):
                    next_form = form
                    break
            if next_form is None:
                logger.info(f"No 'Next' page found for '{query}'. Ending search.")
                break

            # Carry over only named inputs; the submit button has no form field
            # of its own and a None key/value cannot be form-encoded.
            payload = {
                inp.get('name'): inp.get('value') or ''
                for inp in next_form.find_all('input')
                if inp.get('name')
            }
            if not payload:
                logger.info(f"Could not find parameters for next page. Ending search.")
                break
            await asyncio.sleep(random.uniform(0.5, 1.5)) # Small delay to mimic human behavior
    except Exception as e:
        # Deliberately broad: a failed search for one query must not abort the
        # caller, which gathers several searches at once.
        logger.error(f"An error occurred during multi-page search for '{query}': {e}", exc_info=True)
    logger.info(f"Found a total of {len(all_results)} sources from {pages_fetched} page(s) for: '{query}'")
    return all_results[:max_results]
 async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
     headers = {'User-Agent': random.choice(USER_AGENTS)}
     try:
+        logger.info(f"Scraping: {source['link']}")
         if source['link'].lower().endswith('.pdf'): raise ValueError("PDF content")
         async with session.get(source['link'], headers=headers, timeout=10, ssl=False) as response:
             if response.status != 200: raise ValueError(f"HTTP status {response.status}")
             if not content.strip(): raise ValueError("Parsed content is empty.")
             return content, source
     except Exception as e:
+        logger.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
         return source.get('snippet', ''), source
 async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
             yield format_sse({"event": "plan", "data": sub_questions})
+            yield format_sse({"event": "status", "data": f"Performing deep search for {len(sub_questions)} topics..."})
+            search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
             all_search_results = await asyncio.gather(*search_tasks)
             unique_sources = list({source['link']: source for results in all_search_results for source in results}.values())
             if not unique_sources:
+                yield format_sse({"event": "error", "data": f"The live multi-page search could not find any relevant sources for '{query}'. The topic might be too obscure."}); return
             sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
             yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})
                     all_sources_used.append(source_info)
             if not consolidated_context.strip():
+                yield format_sse({"event": "error", "data": "Found sources, but failed to scrape meaningful content from any of them."}); return
             yield format_sse({"event": "status", "data": "Synthesizing final report..."})
             report_prompt = f'Synthesize the provided context into a long-form, comprehensive, multi-page report on "{query}". Use markdown. Elaborate extensively on each point. Base your entire report ONLY on the provided context.\n\n## Research Context ##\n{consolidated_context}'