import requests, re, json, time, os
from bs4 import BeautifulSoup


def web_crawler_loader(
    urls_file="/home/user/app/data/urls.txt",
    cache_path="/home/user/app/persistent/web_cache.json",
    max_pages=3,
    timeout=20,
    force_refresh=False,
):
    """Fetch and cache text content from official URLs."""
    # Load the existing cache unless a forced refresh is requested.
    cache = {}
    if os.path.exists(cache_path) and not force_refresh:
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                cache = json.load(f)
        except Exception:
            cache = {}

    # Without a URL list there is nothing to fetch; fall back to whatever is cached.
    if not os.path.exists(urls_file):
        print(f"⚠️ URLs file missing: {urls_file}")
        return list(cache.values())

    # Read the URL list, skipping blank lines and '#' comment lines.
    with open(urls_file, "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f]
    urls = [u for u in urls if u and not u.startswith("#")]
    urls = urls[: max_pages * 10]

    new_entries = {}
    for i, url in enumerate(urls):
        # Reuse cached pages instead of re-fetching them.
        if url in cache and not force_refresh:
            new_entries[url] = cache[url]
            continue
        try:
            print(f"🌐 Fetching ({i + 1}/{len(urls)}): {url}")
            r = requests.get(
                url,
                timeout=timeout,
                headers={"User-Agent": "ClinicalTrialChatBot/1.0"},
            )
            if r.status_code != 200:
                print(f"⚠️ Skipped {url}: HTTP {r.status_code}")
                continue

            # Drop non-content tags, then collapse whitespace in the visible text.
            soup = BeautifulSoup(r.text, "html.parser")
            for tag in soup(["script", "style", "header", "footer", "nav", "iframe"]):
                tag.decompose()
            text = " ".join(soup.get_text().split())

            # Skip near-empty pages; keep at most 3000 characters per page.
            if len(text) < 400:
                continue
            entry_text = f"Source URL: {url}. {text[:3000]}"
            new_entries[url] = {"source": url, "type": "Website", "text": entry_text}
            time.sleep(1)  # polite delay between requests
        except Exception as e:
            print(f"⚠️ Error fetching {url}: {e}")

    # Merge new entries into the cache and persist it to disk.
    cache.update(new_entries)
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(cache, f, indent=2)
    print(f"💾 Web cache updated ({len(cache)} entries).")
    return list(cache.values())
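

# Usage sketch: one way the loader might be invoked, e.g. at app start-up. This entry
# point is an assumption (not shown in the file above) and relies on the default
# urls.txt and cache paths existing on the running filesystem.
if __name__ == "__main__":
    docs = web_crawler_loader(force_refresh=False)
    print(f"Loaded {len(docs)} web documents.")
    for doc in docs[:3]:
        # Each cached entry is a dict with "source", "type", and "text" keys.
        print(f"- {doc['source']}: {doc['text'][:80]}")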