Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,7 @@ import requests
|
|
| 8 |
import urllib.parse
|
| 9 |
import asyncio
|
| 10 |
import aiohttp
|
|
|
|
| 11 |
from typing import List
|
| 12 |
|
| 13 |
app = FastAPI()
|
|
@@ -213,6 +214,71 @@ async def web_search_and_extract(
|
|
| 213 |
except Exception as e:
|
| 214 |
raise HTTPException(status_code=500, detail=f"Error during search and extraction: {e}")
|
| 215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
@app.get("/api/adv_web_search")
|
| 217 |
async def adv_web_search(
|
| 218 |
q: str,
|
|
|
|
| 8 |
import urllib.parse
|
| 9 |
import asyncio
|
| 10 |
import aiohttp
|
| 11 |
+
import threading
|
| 12 |
from typing import List
|
| 13 |
|
| 14 |
app = FastAPI()
|
|
|
|
| 214 |
except Exception as e:
|
| 215 |
raise HTTPException(status_code=500, detail=f"Error during search and extraction: {e}")
|
| 216 |
|
| 217 |
+
def extract_text_from_webpage2(html_content):
    """Extract the visible text from an HTML document.

    Args:
        html_content: Raw HTML markup as a string.

    Returns:
        The page's visible text with boilerplate tags removed. Text from
        adjacent elements is joined with a single space so words from
        separate tags do not run together.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # Drop tags that carry no article content before extracting text.
    for tag in soup(["script", "style", "header", "footer", "nav"]):
        tag.extract()
    # separator=" " is the fix: get_text(strip=True) alone concatenates
    # the text of adjacent tags with no delimiter, producing run-on words.
    return soup.get_text(separator=" ", strip=True)
|
| 226 |
+
|
| 227 |
+
def fetch_and_extract2(url, max_chars, timeout=10):
    """Fetch *url* and return its truncated visible text.

    Intended to run on a worker thread (see the threading endpoint), so a
    network failure is reported rather than raised.

    Args:
        url: Page to download.
        max_chars: Maximum characters of extracted text to keep; longer
            text is truncated and suffixed with "...".
        timeout: Seconds to wait for the HTTP response. Backward-compatible
            addition: without it a stalled server would hang the worker
            thread forever, and the caller join()s every thread.

    Returns:
        dict with keys "link" (the url) and "text" (extracted visible
        text, or None when the request failed).
    """
    try:
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,  # bound the wait so join() cannot block indefinitely
        )
        response.raise_for_status()
        visible_text = extract_text_from_webpage2(response.text)
        if len(visible_text) > max_chars:
            visible_text = visible_text[:max_chars] + "..."
        return {"link": url, "text": visible_text}
    except requests.exceptions.RequestException as e:
        # Best-effort: log and return a null "text" so one bad URL does
        # not fail the whole batch of threads.
        print(f"Error fetching or processing {url}: {e}")
        return {"link": url, "text": None}
|
| 240 |
+
|
| 241 |
+
@app.get("/api/websearch-and-extract-threading")
def web_search_and_extract_threading(
    q: str,
    max_results: int = 3,
    timelimit: Optional[str] = None,
    safesearch: str = "moderate",
    region: str = "wt-wt",
    backend: str = "html",
    max_chars: int = 6000,
    extract_only: bool = True
):
    """
    Searches using WEBS, extracts text from the top results using threading, and returns both.

    Each result URL is fetched on its own thread via fetch_and_extract2;
    results are appended to a shared list (list.append is atomic under the
    GIL, so no lock is needed). Returns only the extracted texts when
    extract_only is True, otherwise both the raw search results and the
    extracted texts. Raises HTTPException(500) on any failure.
    """
    try:
        with WEBS() as webs:
            # Perform WEBS search.
            search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
                                       timelimit=timelimit, backend=backend, max_results=max_results)

            extracted_results = []

            def _worker(link):
                # One fetch per thread; append is thread-safe under the GIL.
                extracted_results.append(fetch_and_extract2(link, max_chars))

            threads = []
            for result in search_results:
                if 'href' in result:
                    # Bug fix: pass the URL through args= instead of closing
                    # over the loop variable. The original
                    # `lambda: ...result['href']...` late-bound `result`, so
                    # slow-starting threads could all fetch whichever URL the
                    # loop had advanced to (typically the last one).
                    thread = threading.Thread(target=_worker, args=(result['href'],))
                    threads.append(thread)
                    thread.start()

            # Wait for all fetches to finish before building the response.
            for thread in threads:
                thread.join()

            if extract_only:
                return JSONResponse(content=jsonable_encoder(extracted_results))
            else:
                return JSONResponse(content=jsonable_encoder(
                    {"search_results": search_results, "extracted_results": extracted_results}))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error during search and extraction: {e}")
|
| 280 |
+
|
| 281 |
+
|
| 282 |
@app.get("/api/adv_web_search")
|
| 283 |
async def adv_web_search(
|
| 284 |
q: str,
|