import json
import os
import time

import requests
from bs4 import BeautifulSoup


def web_crawler_loader(
    urls_file="/home/user/app/data/urls.txt",
    cache_path="/home/user/app/persistent/web_cache.json",
    max_pages=3,
    timeout=20,
    force_refresh=False,
):
    """Fetch and cache text content from official URLs."""
    # Load the existing cache unless a full refresh was requested.
    cache = {}
    if os.path.exists(cache_path) and not force_refresh:
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                cache = json.load(f)
        except Exception:
            cache = {}

    if not os.path.exists(urls_file):
        print(f"⚠️ URLs file missing: {urls_file}")
        return list(cache.values())

    # Read the URL list, skipping blank lines and '#' comment lines.
    with open(urls_file, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]
    urls = [u for u in lines if u and not u.startswith("#")]

    to_fetch = urls[: max_pages * 10]  # cap the number of pages per run
    new_entries = {}
    for i, url in enumerate(to_fetch):
        # Reuse cached entries so only new URLs trigger a network request.
        if url in cache and not force_refresh:
            new_entries[url] = cache[url]
            continue
        try:
            print(f"🌐 Fetching ({i + 1}/{len(to_fetch)}): {url}")
            r = requests.get(
                url,
                timeout=timeout,
                headers={"User-Agent": "ClinicalTrialChatBot/1.0"},
            )
            if r.status_code != 200:
                print(f"⚠️ Skipped {url}: HTTP {r.status_code}")
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            # Strip non-content tags before extracting visible text.
            for tag in soup(["script", "style", "header", "footer", "nav", "iframe"]):
                tag.decompose()
            text = " ".join(soup.get_text().split())
            # Skip near-empty pages (error stubs, redirects, paywalls).
            if len(text) < 400:
                continue
            entry_text = f"Source URL: {url}. {text[:3000]}"
            new_entries[url] = {"source": url, "type": "Website", "text": entry_text}
            time.sleep(1)  # Be polite: throttle between requests.
        except Exception as e:
            print(f"⚠️ Error fetching {url}: {e}")

    # Merge new entries into the cache and persist it to disk.
    cache.update(new_entries)
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(cache, f, indent=2)
    print(f"💾 Web cache updated ({len(cache)} entries).")
    return list(cache.values())
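
# Minimal usage sketch (an assumption for illustration, not part of the
# original module): run the loader directly and inspect the returned
# entries. Assumes the default urls.txt and persistent/ paths from the
# signature above exist; adjust them for your environment.
if __name__ == "__main__":
    docs = web_crawler_loader(force_refresh=False)
    print(f"Loaded {len(docs)} cached web documents.")
    for doc in docs[:3]:
        # Each entry is a dict: {"source": url, "type": "Website", "text": ...}
        print(doc["source"], "->", doc["text"][:80])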