Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,7 @@ import requests
|
|
| 8 |
import urllib.parse
|
| 9 |
import asyncio
|
| 10 |
import aiohttp
|
|
|
|
| 11 |
from typing import List
|
| 12 |
|
| 13 |
app = FastAPI()
|
|
@@ -213,6 +214,71 @@ async def web_search_and_extract(
|
|
| 213 |
except Exception as e:
|
| 214 |
raise HTTPException(status_code=500, detail=f"Error during search and extraction: {e}")
|
| 215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
@app.get("/api/adv_web_search")
|
| 217 |
async def adv_web_search(
|
| 218 |
q: str,
|
|
|
|
| 8 |
import urllib.parse
|
| 9 |
import asyncio
|
| 10 |
import aiohttp
|
| 11 |
+
import threading
|
| 12 |
from typing import List
|
| 13 |
|
| 14 |
app = FastAPI()
|
|
|
|
| 214 |
except Exception as e:
|
| 215 |
raise HTTPException(status_code=500, detail=f"Error during search and extraction: {e}")
|
| 216 |
|
| 217 |
+
def extract_text_from_webpage2(html_content):
    """Extract the visible text from an HTML document.

    Args:
        html_content: Raw HTML markup as a string.

    Returns:
        The page's visible text with boilerplate tags removed. Text from
        adjacent elements is joined with a single space so words from
        separate tags do not run together.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # Drop tags that carry no article content before extracting text.
    for tag in soup(["script", "style", "header", "footer", "nav"]):
        tag.extract()
    # separator=" " is the fix: get_text(strip=True) alone concatenates
    # the text of adjacent tags with no delimiter, producing run-on words.
    return soup.get_text(separator=" ", strip=True)
|
| 226 |
+
|
| 227 |
+
def fetch_and_extract2(url, max_chars, timeout=10):
    """Fetch *url* and return its truncated visible text.

    Intended to run on a worker thread (see the threading endpoint), so a
    network failure is reported rather than raised.

    Args:
        url: Page to download.
        max_chars: Maximum characters of extracted text to keep; longer
            text is truncated and suffixed with "...".
        timeout: Seconds to wait for the HTTP response. Backward-compatible
            addition: without it a stalled server would hang the worker
            thread forever, and the caller join()s every thread.

    Returns:
        dict with keys "link" (the url) and "text" (extracted visible
        text, or None when the request failed).
    """
    try:
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,  # bound the wait so join() cannot block indefinitely
        )
        response.raise_for_status()
        visible_text = extract_text_from_webpage2(response.text)
        if len(visible_text) > max_chars:
            visible_text = visible_text[:max_chars] + "..."
        return {"link": url, "text": visible_text}
    except requests.exceptions.RequestException as e:
        # Best-effort: log and return a null "text" so one bad URL does
        # not fail the whole batch of threads.
        print(f"Error fetching or processing {url}: {e}")
        return {"link": url, "text": None}
|
| 240 |
+
|
| 241 |
+
@app.get("/api/websearch-and-extract-threading")
def web_search_and_extract_threading(
    q: str,
    max_results: int = 3,
    timelimit: Optional[str] = None,
    safesearch: str = "moderate",
    region: str = "wt-wt",
    backend: str = "html",
    max_chars: int = 6000,
    extract_only: bool = True
):
    """
    Searches using WEBS, extracts text from the top results using threading, and returns both.

    Each result URL is fetched on its own thread via fetch_and_extract2;
    results are appended to a shared list (list.append is atomic under the
    GIL, so no lock is needed). Returns only the extracted texts when
    extract_only is True, otherwise both the raw search results and the
    extracted texts. Raises HTTPException(500) on any failure.
    """
    try:
        with WEBS() as webs:
            # Perform WEBS search.
            search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
                                       timelimit=timelimit, backend=backend, max_results=max_results)

            extracted_results = []

            def _worker(link):
                # One fetch per thread; append is thread-safe under the GIL.
                extracted_results.append(fetch_and_extract2(link, max_chars))

            threads = []
            for result in search_results:
                if 'href' in result:
                    # Bug fix: pass the URL through args= instead of closing
                    # over the loop variable. The original
                    # `lambda: ...result['href']...` late-bound `result`, so
                    # slow-starting threads could all fetch whichever URL the
                    # loop had advanced to (typically the last one).
                    thread = threading.Thread(target=_worker, args=(result['href'],))
                    threads.append(thread)
                    thread.start()

            # Wait for all fetches to finish before building the response.
            for thread in threads:
                thread.join()

            if extract_only:
                return JSONResponse(content=jsonable_encoder(extracted_results))
            else:
                return JSONResponse(content=jsonable_encoder(
                    {"search_results": search_results, "extracted_results": extracted_results}))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error during search and extraction: {e}")
|
| 280 |
+
|
| 281 |
+
|
| 282 |
@app.get("/api/adv_web_search")
|
| 283 |
async def adv_web_search(
|
| 284 |
q: str,
|