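# NOTE: the following header lines are residue from the web file viewer this
# source was copied from; they are kept as comments so the module still parses.
# Scrap / main.py
# rkihacker's picture
# Update main.py
# 0eacd1e verified
# raw
# history blame
# 9.66 kB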
import os
import asyncio
import json
import logging
import random
import re
from typing import AsyncGenerator, Optional, Tuple, List
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from dotenv import load_dotenv
import aiohttp
from bs4 import BeautifulSoup
from ddgs import DDGS # <-- Make sure this import is present
# --- Configuration ---
# Timestamped INFO-level logging for the whole process.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Load environment variables (the error below points users at a .env file).
load_dotenv()

LLM_API_KEY = os.getenv("LLM_API_KEY")
if LLM_API_KEY:
    logger.info("LLM API Key loaded successfully.")
else:
    # Fail fast at import time: the key is embedded in the Authorization
    # header that is built further down in this module.
    raise RuntimeError("LLM_API_KEY must be set in a .env file.")
# --- Constants & Headers ---
# Chat-completions endpoint and model name sent with every LLM request.
LLM_API_URL = "https://api.typegpt.net/v1/chat/completions"
LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"

# Upper bound on how many de-duplicated search hits get scraped per request.
MAX_SOURCES_TO_PROCESS = 15

# Real Browser User Agents for SCRAPING
# (one is picked at random for each page fetch).
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0",
]

# Headers for LLM API calls; the bearer token comes from LLM_API_KEY.
LLM_HEADERS = {
    "Authorization": f"Bearer {LLM_API_KEY}",
    "Content-Type": "application/json",
    "Accept": "application/json",
}
class DeepResearchRequest(BaseModel):
    # Request body accepted by the /deep-research endpoint.
    # `query` is the free-text topic the report is generated for.
    query: str
# ASGI application object with its descriptive metadata.
app = FastAPI(
    title="AI Deep Research API",
    description="Provides robust, long-form, streaming deep research completions using the DuckDuckGo Search API.",
    version="9.4.0",  # Reverted to reliable DDGS library search
)

# Enable CORS for all origins
# NOTE(review): wildcard origins together with allow_credentials=True is a
# very permissive combination — confirm it is intentional for this service.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# --- Helper Functions ---
def extract_json_from_llm_response(text: str) -> Optional[list]:
    """Return the first JSON array embedded in an LLM reply, or ``None``.

    LLM replies often wrap the requested array in prose or markdown fences,
    and that prose may itself contain square brackets.  Instead of grabbing
    everything between the first ``[`` and the last ``]`` (which fails as soon
    as there is a stray bracket anywhere), every ``[`` is tried in order as a
    potential start of a JSON value and the first one that decodes to a list
    is returned.

    Args:
        text: Raw text returned by the LLM.  ``None`` or an empty string is
            tolerated and yields ``None``.

    Returns:
        The decoded list, or ``None`` when no valid JSON array is present.
    """
    if not text or not isinstance(text, str):
        return None

    decoder = json.JSONDecoder()
    start = text.find('[')
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and
            # ignores whatever trails it.
            value, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(value, list):
                return value
        start = text.find('[', start + 1)
    return None
# --- Core Service Functions ---
async def call_duckduckgo_search(
    session: aiohttp.ClientSession,
    query: str,
    max_results: int = 10,
) -> List[dict]:
    """
    Run a DuckDuckGo text search through the DDGS library.

    The DDGS client manages its own HTTP transport and its ``text()`` method
    is synchronous, so the call is pushed to a worker thread to keep the
    event loop responsive while several searches run concurrently.

    Args:
        session: Unused; kept so existing callers that pass the shared
            aiohttp session keep working.
        query: Search phrase.
        max_results: Maximum number of hits requested from DDGS.

    Returns:
        A list of ``{'title', 'link', 'snippet'}`` dicts.  Hits missing any
        of those fields are dropped.  Any failure is logged and results in
        an empty list (best-effort search).
    """
    logger.info(f"Searching DuckDuckGo API via DDGS for: '{query}'")

    def _blocking_search() -> List[dict]:
        # Synchronous library call; executed off the event loop below.
        return DDGS().text(query, max_results=max_results) or []

    try:
        raw_results = await asyncio.to_thread(_blocking_search)
        # Filter and format results to ensure they have the necessary keys
        results = [
            {'title': r.get('title'), 'link': r.get('href'), 'snippet': r.get('body')}
            for r in raw_results
            if r.get('href') and r.get('title') and r.get('body')
        ]
        logger.info(f"Found {len(results)} sources from DuckDuckGo for: '{query}'")
        return results
    except Exception as e:
        # Deliberate best-effort: one failed sub-query must not abort the run.
        logger.error(f"DDGS search failed for query '{query}': {e}", exc_info=True)
        return []
async def research_and_process_source(session: aiohttp.ClientSession, source: dict) -> Tuple[str, dict]:
    """
    Fetch one search hit and reduce it to plain text.

    The page is downloaded with a randomly chosen browser User-Agent,
    non-content tags are removed, and the remaining text is joined with
    single spaces.  On any failure (PDF, non-200 status, timeout, empty
    page, parse error) the search snippet is returned instead.

    Args:
        session: Shared aiohttp session used for the GET request.
        source: Search hit; ``'link'`` is read, ``'snippet'`` is the fallback.

    Returns:
        ``(text, source)`` where ``text`` is the page text or the snippet
        (possibly an empty string) and ``source`` is the dict passed in.
    """
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    try:
        logger.info(f"Scraping: {source['link']}")

        # Ignore query string / fragment so "paper.pdf?download=1" is caught.
        url_path = source['link'].split('#', 1)[0].split('?', 1)[0]
        if url_path.lower().endswith('.pdf'):
            raise ValueError("PDF content")

        # NOTE(review): ssl=False disables TLS certificate verification —
        # presumably to tolerate misconfigured sites; confirm this is wanted.
        request_timeout = aiohttp.ClientTimeout(total=10)
        async with session.get(
            source['link'], headers=headers, timeout=request_timeout, ssl=False
        ) as response:
            if response.status != 200:
                raise ValueError(f"HTTP status {response.status}")
            # PDFs are also served from URLs without a .pdf suffix.
            if 'application/pdf' in response.headers.get('Content-Type', '').lower():
                raise ValueError("PDF content")
            html = await response.text()

        soup = BeautifulSoup(html, "html.parser")
        # Strip markup that rarely carries article content.
        for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            tag.decompose()
        content = " ".join(soup.stripped_strings)
        if not content.strip():
            raise ValueError("Parsed content is empty.")
        return content, source
    except Exception as e:
        logger.warning(f"Scraping failed for {source['link']} ({e}). Falling back to snippet.")
        # `or ''` keeps the declared str return type even for a None snippet.
        return source.get('snippet') or '', source
# --- Streaming Deep Research Logic ---
async def run_deep_research_stream(query: str) -> AsyncGenerator[str, None]:
    """
    Drive the full research pipeline and yield its progress as SSE frames.

    Steps: (1) ask the LLM for a JSON array of sub-questions, (2) search
    DuckDuckGo for each sub-question concurrently, (3) scrape the
    de-duplicated hits (at most ``MAX_SOURCES_TO_PROCESS``), (4) stream an
    LLM-written report based on the scraped text, (5) emit the sources used.

    Args:
        query: Research topic supplied by the client.

    Yields:
        ``"data: <json>\\n\\n"`` strings whose JSON object has an ``event``
        key (``status``, ``plan``, ``chunk``, ``sources`` or ``error``) and a
        ``data`` payload.  After an ``error`` event the generator stops.
    """
    def format_sse(data: dict) -> str:
        # One server-sent-event frame: a "data:" line plus a blank line.
        return f"data: {json.dumps(data)}\n\n"

    try:
        async with aiohttp.ClientSession() as session:
            # --- Step 1: research plan -------------------------------------
            yield format_sse({"event": "status", "data": "Generating research plan..."})
            plan_prompt = {
                "model": LLM_MODEL,
                "messages": [{
                    "role": "user",
                    "content": f"Generate 3-4 key sub-questions for a research report on '{query}'. Your response MUST be ONLY the raw JSON array. Example: [\"Question 1?\"]"
                }]
            }
            try:
                plan_timeout = aiohttp.ClientTimeout(total=25)
                async with session.post(
                    LLM_API_URL, headers=LLM_HEADERS, json=plan_prompt, timeout=plan_timeout
                ) as response:
                    response.raise_for_status()
                    result = await response.json()

                # The API normally answers with a chat-completion object, but a
                # bare JSON array is accepted as well.
                if isinstance(result, list):
                    raw_plan = result
                else:
                    raw_plan = extract_json_from_llm_response(
                        result['choices'][0]['message']['content']
                    )
                if not isinstance(raw_plan, list):
                    raise ValueError(f"Invalid or empty plan from LLM: {result}")
                # Only non-empty strings are usable as search queries.
                sub_questions = [
                    item.strip() for item in raw_plan
                    if isinstance(item, str) and item.strip()
                ]
                if not sub_questions:
                    raise ValueError(f"Invalid or empty plan from LLM: {result}")
            except Exception as e:
                yield format_sse({"event": "error", "data": f"Could not generate research plan. Reason: {e}"})
                return
            yield format_sse({"event": "plan", "data": sub_questions})

            # --- Step 2: search --------------------------------------------
            yield format_sse({"event": "status", "data": f"Searching sources for {len(sub_questions)} topics..."})
            search_tasks = [call_duckduckgo_search(session, sq) for sq in sub_questions]
            all_search_results = await asyncio.gather(*search_tasks)

            # De-duplicate by URL; dict insertion order keeps first-seen order.
            sources_by_link = {}
            for results in all_search_results:
                for source in results:
                    sources_by_link[source['link']] = source
            unique_sources = list(sources_by_link.values())
            if not unique_sources:
                yield format_sse({"event": "error", "data": f"Could not find any relevant sources for the query '{query}'. Please try a different topic."})
                return

            # --- Step 3: scrape --------------------------------------------
            sources_to_process = unique_sources[:MAX_SOURCES_TO_PROCESS]
            yield format_sse({"event": "status", "data": f"Found {len(unique_sources)} unique sources. Processing the top {len(sources_to_process)}..."})
            processing_tasks = [research_and_process_source(session, source) for source in sources_to_process]
            context_parts, all_sources_used = [], []
            # Consume results in completion order so slow sites don't block fast ones.
            for task in asyncio.as_completed(processing_tasks):
                content, source_info = await task
                if content and content.strip():
                    context_parts.append(f"Source: {source_info['link']}\nContent: {content}\n\n---\n\n")
                    all_sources_used.append(source_info)
            if not context_parts:
                yield format_sse({"event": "error", "data": "Failed to scrape content from any of the discovered sources."})
                return
            consolidated_context = "".join(context_parts)

            # --- Step 4: synthesize and stream the report ------------------
            yield format_sse({"event": "status", "data": "Synthesizing final report..."})
            report_prompt = f'Synthesize the provided context into a long-form, comprehensive, multi-page report on "{query}". Use markdown. Elaborate extensively on each point. Base your entire report ONLY on the provided context.\n\n## Research Context ##\n{consolidated_context}'
            report_payload = {"model": LLM_MODEL, "messages": [{"role": "user", "content": report_prompt}], "stream": True}
            async with session.post(LLM_API_URL, headers=LLM_HEADERS, json=report_payload) as response:
                response.raise_for_status()
                async for line in response.content:
                    line_str = line.decode('utf-8').strip()
                    if not line_str.startswith('data:'):
                        continue
                    line_str = line_str[5:].strip()
                    if line_str == "[DONE]":
                        break
                    try:
                        chunk = json.loads(line_str)
                    except json.JSONDecodeError:
                        # Skip keep-alives or partial frames that are not JSON.
                        continue
                    if not isinstance(chunk, dict):
                        continue
                    choices = chunk.get("choices")
                    if choices and isinstance(choices, list):
                        # `or {}` tolerates an explicit `"delta": null`.
                        content = (choices[0].get("delta") or {}).get("content")
                        if content:
                            yield format_sse({"event": "chunk", "data": content})

            # --- Step 5: sources -------------------------------------------
            yield format_sse({"event": "sources", "data": all_sources_used})
    except Exception as e:
        # Last-resort boundary: report the failure to the client as an event.
        logger.error(f"A critical error occurred: {e}", exc_info=True)
        yield format_sse({"event": "error", "data": f"An unexpected error occurred: {str(e)}"})
@app.post("/deep-research", response_class=StreamingResponse)
async def deep_research_endpoint(request: DeepResearchRequest):
    """
    Accepts a query and streams back a detailed research report.
    Events: status, plan, chunk, sources, error
    """
    # Build the async generator first, then hand it to the streaming response
    # so each yielded frame is sent to the client as it is produced.
    event_stream = run_deep_research_stream(request.query)
    return StreamingResponse(event_stream, media_type="text/event-stream")
if __name__ == "__main__":
    # Imported lazily: only needed when this file is executed as a script.
    # To run this app: uvicorn your_filename:app --reload
    import uvicorn

    # Listen on all interfaces, port 8000.
    uvicorn.run(app, port=8000, host="0.0.0.0")