import requests, json, time, os
from bs4 import BeautifulSoup

def web_crawler_loader(
    urls_file="/home/user/app/data/urls.txt",
    cache_path="/home/user/app/persistent/web_cache.json",
    max_pages=3,
    timeout=20,
    force_refresh=False,
):
    """Fetch and cache text content from official URLs."""
    cache = {}
    if os.path.exists(cache_path) and not force_refresh:
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                cache = json.load(f)
        except Exception:
            cache = {}

    if not os.path.exists(urls_file):
        print(f"⚠️ URLs file missing: {urls_file}")
        return list(cache.values())

    with open(urls_file, "r", encoding="utf-8") as f:
        urls = [u.strip() for u in f if u.strip() and not u.startswith("#")]

    new_entries = {}
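    # Crawl at most max_pages * 10 URLs, reusing cached entries unless a refresh is forced.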
    for i, url in enumerate(urls[: max_pages * 10]):
        if url in cache and not force_refresh:
            new_entries[url] = cache[url]
            continue
        try:
            print(f"🌐 Fetching ({i+1}/{len(urls)}): {url}")
            r = requests.get(url, timeout=timeout, headers={"User-Agent": "ClinicalTrialChatBot/1.0"})
            if r.status_code != 200:
                print(f"⚠️ Skipped {url}: HTTP {r.status_code}")
                continue
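            # Remove non-content tags, collapse whitespace, and skip pages with too little text.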
            soup = BeautifulSoup(r.text, "html.parser")
            for tag in soup(["script", "style", "header", "footer", "nav", "iframe"]):
                tag.decompose()
            text = " ".join(soup.get_text().split())
            if len(text) < 400:
                continue
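            # Store a URL-prefixed snippet (first 3,000 characters) and pause briefly between requests.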
            entry_text = f"Source URL: {url}. {text[:3000]}"
            new_entries[url] = {"source": url, "type": "Website", "text": entry_text}
            time.sleep(1)
        except Exception as e:
            print(f"⚠️ Error fetching {url}: {e}")

    # Merge new entries into the cache and persist it, creating the cache directory if needed.
    cache.update(new_entries)
    os.makedirs(os.path.dirname(cache_path) or ".", exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(cache, f, indent=2)
    print(f"💾 Web cache updated ({len(cache)} entries).")
    return list(cache.values())
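

# Usage sketch (assumption): running this module directly is just a quick smoke test;
# in the actual app, web_crawler_loader() is presumably imported and called by the
# document-ingestion step instead.
if __name__ == "__main__":
    docs = web_crawler_loader()
    print(f"Loaded {len(docs)} web documents.")
    for doc in docs[:3]:
        print(f"- {doc['source']}: {doc['text'][:80]}...")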