import requests, re, json, time, os
from bs4 import BeautifulSoup


def web_crawler_loader(
    urls_file="/home/user/app/data/urls.txt",
    cache_path="/home/user/app/persistent/web_cache.json",
    max_pages=3,
    timeout=20,
    force_refresh=False,
):
    """Fetch and cache text content from official URLs."""
    # Load the existing cache unless a forced refresh is requested.
    cache = {}
    if os.path.exists(cache_path) and not force_refresh:
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                cache = json.load(f)
        except Exception:
            cache = {}

    # Without a URL list there is nothing to fetch; fall back to whatever is cached.
    if not os.path.exists(urls_file):
        print(f"⚠️ URLs file missing: {urls_file}")
        return list(cache.values())

    # Read the URL list, skipping blank lines and '#' comment lines.
    with open(urls_file, "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f]
    urls = [u for u in urls if u and not u.startswith("#")]
    urls = urls[: max_pages * 10]

    new_entries = {}
    for i, url in enumerate(urls):
        # Reuse cached pages instead of re-fetching them.
        if url in cache and not force_refresh:
            new_entries[url] = cache[url]
            continue
        try:
            print(f"🌐 Fetching ({i + 1}/{len(urls)}): {url}")
            r = requests.get(
                url,
                timeout=timeout,
                headers={"User-Agent": "ClinicalTrialChatBot/1.0"},
            )
            if r.status_code != 200:
                print(f"⚠️ Skipped {url}: HTTP {r.status_code}")
                continue

            # Drop non-content tags, then collapse whitespace in the visible text.
            soup = BeautifulSoup(r.text, "html.parser")
            for tag in soup(["script", "style", "header", "footer", "nav", "iframe"]):
                tag.decompose()
            text = " ".join(soup.get_text().split())

            # Skip near-empty pages; keep at most 3000 characters per page.
            if len(text) < 400:
                continue
            entry_text = f"Source URL: {url}. {text[:3000]}"
            new_entries[url] = {"source": url, "type": "Website", "text": entry_text}
            time.sleep(1)  # polite delay between requests
        except Exception as e:
            print(f"⚠️ Error fetching {url}: {e}")

    # Merge new entries into the cache and persist it to disk.
    cache.update(new_entries)
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(cache, f, indent=2)
    print(f"💾 Web cache updated ({len(cache)} entries).")
    return list(cache.values())
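

# Usage sketch: one way the loader might be invoked, e.g. at app start-up. This entry
# point is an assumption (not shown in the file above) and relies on the default
# urls.txt and cache paths existing on the running filesystem.
if __name__ == "__main__":
    docs = web_crawler_loader(force_refresh=False)
    print(f"Loaded {len(docs)} web documents.")
    for doc in docs[:3]:
        # Each cached entry is a dict with "source", "type", and "text" keys.
        print(f"- {doc['source']}: {doc['text'][:80]}")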