# ClinicalTrialBasics/core/web_loader.py
import requests, json, time, os
from bs4 import BeautifulSoup
def web_crawler_loader(
urls_file="/home/user/app/data/urls.txt",
cache_path="/home/user/app/persistent/web_cache.json",
max_pages=3,
timeout=20,
force_refresh=False,
):
"""Fetch and cache text content from official URLs."""
cache = {}
if os.path.exists(cache_path) and not force_refresh:
try:
with open(cache_path, "r", encoding="utf-8") as f:
cache = json.load(f)
except Exception:
cache = {}
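    # Without a URLs file there is nothing to crawl; return whatever is already cached.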
if not os.path.exists(urls_file):
print(f"⚠️ URLs file missing: {urls_file}")
return list(cache.values())
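    # One URL per line; blank lines and #-comment lines are skipped.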
with open(urls_file, "r", encoding="utf-8") as f:
        urls = [u.strip() for u in f if u.strip() and not u.strip().startswith("#")]
new_entries = {}
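    # Cap the number of URLs attempted per run (max_pages batches of 10).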
    selected = urls[: max_pages * 10]
    for i, url in enumerate(selected):
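        # Reuse the previously cached entry so already-fetched URLs are not re-downloaded.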
if url in cache and not force_refresh:
new_entries[url] = cache[url]
continue
try:
print(f"🌐 Fetching ({i+1}/{len(urls)}): {url}")
r = requests.get(url, timeout=timeout, headers={"User-Agent": "ClinicalTrialChatBot/1.0"})
if r.status_code != 200:
print(f"⚠️ Skipped {url}: HTTP {r.status_code}")
continue
soup = BeautifulSoup(r.text, "html.parser")
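            # Strip non-content elements so only the visible page text remains.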
for tag in soup(["script", "style", "header", "footer", "nav", "iframe"]):
tag.decompose()
text = " ".join(soup.get_text().split())
if len(text) < 400:
continue
entry_text = f"Source URL: {url}. {text[:3000]}"
new_entries[url] = {"source": url, "type": "Website", "text": entry_text}
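            # Brief pause before the next request to be polite to the target servers.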
time.sleep(1)
except Exception as e:
print(f"⚠️ Error fetching {url}: {e}")
cache.update(new_entries)
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
with open(cache_path, "w", encoding="utf-8") as f:
json.dump(cache, f, indent=2)
print(f"💾 Web cache updated ({len(cache)} entries).")
return list(cache.values())
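
# Minimal usage sketch (assumption: this block is illustrative and not part of the
# original module). Running the file directly performs a quick smoke test of the
# loader with its default paths, which may not exist outside the deployment environment.
if __name__ == "__main__":
    docs = web_crawler_loader(force_refresh=False)
    print(f"Loaded {len(docs)} web document entries.")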