essprasad committed on
Commit f9053c5 · verified · 1 Parent(s): b816136

Upload 11 files

Files changed (11)
  1. README.md +47 -7
  2. app.py +394 -0
  3. cleanup_space.py +135 -0
  4. gitattributes +49 -0
  5. gitignore +71 -0
  6. gitignore (1) +71 -0
  7. lfsconfig +4 -0
  8. postBuild +60 -0
  9. requirements.txt +43 -0
  10. runtime.txt +1 -0
  11. runtime.yaml +26 -0
README.md CHANGED
@@ -1,13 +1,53 @@
  ---
- title: ClinicalTrialBasics
- emoji: 📉
- colorFrom: gray
- colorTo: yellow
+ title: Clinical Research Chatbot
+ emoji: 🧪
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
- sdk_version: 5.49.1
+ sdk_version: 5.49.0
  app_file: app.py
  pinned: false
- short_description: 'Gives answers from trusted, credible and authentic sources '
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 🧪 Clinical Research Chatbot
+
+ A lightweight, fully open-source chatbot for clinical research professionals.
+ Runs entirely on Hugging Face — no OpenAI dependency.
+
+ ---
+
+ ## ✅ Current Features
+
+ ### 💬 Chatbot Interface
+ - Gradio UI with chatbot + Admin Tools tab.
+ - Query pipeline: **FAQ → Glossary → Knowledge Base → APIs (PubMed → FDA → ClinicalTrials.gov)**.
+ - Answers are clearly labeled by source.
+
+ ### 🔍 Knowledge Base (Docs + URLs)
+ - Supports ingestion of: PDF, DOCX, TXT, XLSX, JSON, HTML.
+ - Auto-ingests from:
+   - `/data/public_docs/`
+   - `/data/urls.txt`
+ - Smart chunking optimized for glossary terms + long text.
+
+ ### 📦 Vector Search
+ - FAISS + `all-MiniLM-L6-v2` embeddings.
+ - Persistent storage:
+   - `/persistent/faiss.index`
+   - `/persistent/faiss.index.meta.json`
+ - Index survives restarts and can be exported/imported as `.zip`.
+
+ ### 🌐 API Integrations
+ - PubMed
+ - FDA Drug Labels
+ - ClinicalTrials.gov
+
+ ### 🧠 Query Handling
+ - Glossary-aware normalization
+   *(e.g., eCRF, e-CRF, electronic case report form → same match)*
+ - Glossary priority: if a glossary hit exists, it is always returned first.
+ - Answer flow: **FAQ → Glossary → KB → APIs**.
+ - Clear section labels, citations, and confidence notes.
+
+ ### 📜 Logging
+ All queries, answers, and sources saved in:
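
The Vector Search section above is the core of the retrieval path. As a minimal sketch of how the persisted index might be queried (assuming the paths listed above and the list-of-dicts metadata layout that app.py in this commit writes out):

```python
# Minimal retrieval sketch against the persisted artifacts described above.
# Paths come from the README; the metadata layout and the normalized
# inner-product search mirror how app.py builds the index.
import json
import faiss
from sentence_transformers import SentenceTransformer

index = faiss.read_index("/persistent/faiss.index")
with open("/persistent/faiss.index.meta.json", encoding="utf-8") as f:
    metas = json.load(f)  # one metadata dict per indexed vector

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
query_vec = model.encode(["What is an eCRF?"], convert_to_numpy=True).astype("float32")
faiss.normalize_L2(query_vec)

scores, ids = index.search(query_vec, 5)
for score, idx in zip(scores[0], ids[0]):
    if idx >= 0:
        print(f"{score:.3f}  {metas[idx].get('source', '?')}  {metas[idx].get('text', '')[:80]}")
```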
app.py ADDED
@@ -0,0 +1,394 @@
# ==========================================================
# SAFE-MODE PRELAUNCH CLEANUP (runs before any heavy imports)
# ==========================================================
import os, shutil, time, glob

def _prelaunch_cleanup(threshold_gb=45.0):
    """Early cleanup to prevent Hugging Face Space eviction (50 GB limit)."""
    def _used_gb(path="/home/user/app"):
        try:
            total, used, free = shutil.disk_usage(path)
            used_gb = max(0.0, min(used / (1024**3), 49.9))
            return used_gb
        except Exception:
            return 0.0

    used = _used_gb()
    print(f"\n💾 Startup disk usage: {used:.2f} GB")

    cache_paths = [
        os.path.expanduser("~/.cache/huggingface"),
        os.path.expanduser("~/.cache/hfhub"),
        "/home/user/.cache/huggingface",
        "/home/user/.cache",
        "/home/user/app/__pycache__",
        "/home/user/app/data/__pycache__",
    ]
    for p in cache_paths:
        if os.path.exists(p):
            shutil.rmtree(p, ignore_errors=True)

    if used > threshold_gb:
        print(f"⚠️ Usage {used:.2f} GB > {threshold_gb} GB — performing aggressive cleanup.")
        preserve = {"faiss.index", "faiss.index.meta.json", "glossary.json"}
        folders = ["/home/user/app/data/docs_cache", "/home/user/app/tmp_docs", "/home/user/app/persistent"]
        for folder in folders:
            if os.path.exists(folder):
                for f in glob.glob(os.path.join(folder, "*")):
                    if os.path.basename(f) in preserve:
                        continue
                    try:
                        if os.path.isfile(f):
                            os.remove(f)
                        else:
                            shutil.rmtree(f, ignore_errors=True)
                    except Exception:
                        pass
        print("🧹 Aggressive cleanup complete.")

    print(f"✨ Disk after cleanup: {_used_gb():.2f} GB\n")
    shutil.rmtree("/home/user/app/runtime_faiss", ignore_errors=True)

_prelaunch_cleanup()

# ==========================================================
# MAIN APP — Clinical Trial Chatbot
# ==========================================================
import gradio as gr
import pandas as pd
import json, faiss, numpy as np, shutil
from sentence_transformers import SentenceTransformer
from core.hybrid_retriever import summarize_combined
from core import vector_store, vector_sync

APP_TITLE = "🧠 Clinical Research Chatbot"
APP_DESC = (
    "Ask any clinical research or GCP-related question. "
    "Retrieves and summarizes from ICH, GCDMP, EMA, FDA, Excel, and Web datasets."
)

DATA_PATHS = [
    "/home/user/app/persistent/faiss.index",
    "/home/user/app/persistent/faiss.index.meta.json",
    "/home/user/app/data/docs_cache",
]

# ----------------------------------------------------------
# CLEAR INDEX / CACHE
# ----------------------------------------------------------
def clear_index():
    removed = []
    for p in DATA_PATHS:
        if os.path.isdir(p):
            shutil.rmtree(p, ignore_errors=True)
            removed.append(f"🗑️ Deleted folder: {p}")
        elif os.path.exists(p):
            os.remove(p)
            removed.append(f"🗑️ Deleted file: {p}")
    msg = "\n".join(removed) if removed else "ℹ️ No cache files found."
    print(msg)
    return msg

# ----------------------------------------------------------
# EMBEDDER HELPER
# ----------------------------------------------------------
def _load_embedder():
    print("📦 Loading embedding model: sentence-transformers/all-MiniLM-L6-v2 ...")
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    print("✅ Model loaded.")
    return model

# ----------------------------------------------------------
# WEB CRAWLER with LOCAL CACHE (Optimized & Safe)
# ----------------------------------------------------------
def web_crawler_loader(
    urls_file="/home/user/app/data/urls.txt",
    cache_path="/home/user/app/persistent/web_cache.json",
    max_pages=3,
    timeout=20,
    force_refresh=False,
):
    """
    Loads readable text content from URLs listed in urls.txt.
    Uses a local cache (web_cache.json) to skip re-downloading.
    Returns list of dicts: [{ 'source': URL, 'type': 'Website', 'text': text }]
    """
    import requests, re, time, json
    from bs4 import BeautifulSoup

    # --- Load existing cache (if any) ---
    cache = {}
    if os.path.exists(cache_path) and not force_refresh:
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                cache = json.load(f)
            print(f"🗂️ Loaded cached web content ({len(cache)} entries).")
        except Exception as e:
            print(f"⚠️ Cache read error ({e}) — starting fresh.")
            cache = {}

    # --- Validate URL list ---
    if not os.path.exists(urls_file):
        print(f"⚠️ URLs file not found: {urls_file}")
        return list(cache.values())

    with open(urls_file, "r", encoding="utf-8") as f:
        urls = [u.strip() for u in f if u.strip() and not u.startswith("#")]

    print(f"🌐 Found {len(urls)} URLs in {urls_file}")
    new_entries = {}

    for i, url in enumerate(urls[: max_pages * 10]):
        if url in cache and not force_refresh:
            print(f"♻️ Using cached content for {url}")
            new_entries[url] = cache[url]
            continue

        try:
            print(f"🌐 Fetching ({i+1}/{len(urls)}): {url}")
            resp = requests.get(
                url,
                timeout=timeout,
                headers={"User-Agent": "ClinicalTrialChatBot/1.0 (+https://huggingface.co/essprasad)"}
            )

            if resp.status_code != 200:
                print(f"⚠️ Skipped {url}: HTTP {resp.status_code}")
                continue

            soup = BeautifulSoup(resp.text, "html.parser")

            # Remove unwanted elements
            for tag in soup(["script", "style", "nav", "header", "footer", "noscript", "iframe"]):
                tag.decompose()

            # Extract visible text
            text = " ".join(t.strip() for t in soup.get_text().split())
            text = re.sub(r"\s+", " ", text).strip()

            if len(text) < 500:
                print(f"⚠️ Skipped {url}: too little readable text ({len(text)} chars).")
                continue

            # Keep first 3000 chars to reduce vector size
            entry_text = f"Source URL: {url}. {text[:3000]}"
            new_entries[url] = {"source": url, "type": "Website", "text": entry_text}
            print(f"✅ Cached: {url}")

            time.sleep(1)  # polite delay

        except Exception as e:
            print(f"⚠️ Failed to fetch {url}: {e}")

    # --- Merge & Save updated cache ---
    if new_entries:
        cache.update(new_entries)
        try:
            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
            with open(cache_path, "w", encoding="utf-8") as f:
                json.dump(cache, f, indent=2)
            print(f"💾 Web cache updated ({len(cache)} total URLs).")
        except Exception as e:
            print(f"⚠️ Failed to write cache: {e}")

    return list(cache.values())


def rebuild_index():
    """Fully rebuild FAISS index using glossary + Excel + web sources (fresh start)."""
    print("🧠 Rebuilding FAISS index (Glossary + Excel + Web)...")

    import os, json, re, shutil, pandas as pd, faiss, numpy as np
    from huggingface_hub import hf_hub_download, list_repo_files
    from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
    from sentence_transformers import SentenceTransformer

    repo_id_index = "essprasad/CT-Chat-Index"
    repo_id_docs = "essprasad/CT-Chat-Docs"
    local_dir = "/home/user/app/persistent"
    os.makedirs(local_dir, exist_ok=True)

    # --- STEP 0: CLEAN OLD INDEX ---
    for old_file in ["faiss.index", "faiss.index.meta.json"]:
        old_path = os.path.join(local_dir, old_file)
        if os.path.exists(old_path):
            os.remove(old_path)
            print(f"🗑️ Removed old FAISS artifact: {old_path}")

    # --- STEP 1: LOAD GLOSSARY BASE ---
    glossary_path = os.path.join(local_dir, "glossary.json")
    if not os.path.exists(glossary_path):
        print(f"📥 Downloading glossary.json from {repo_id_index}...")
        downloaded_path = hf_hub_download(
            repo_id=repo_id_index,
            filename="persistent/glossary.json",
            repo_type="dataset",
            force_download=True,
        )
        shutil.copy2(downloaded_path, glossary_path)
        print(f"✅ glossary.json copied to {glossary_path}")

    index, metas = rebuild_faiss_from_glossary(glossary_path=glossary_path)
    print(f"📘 Loaded {len(metas)} glossary entries.")

    # --- STEP 2: INDEX EXCEL FILES ---
    print("📑 Scanning Excel files...")
    repo_files = list_repo_files(repo_id_docs, repo_type="dataset")
    excel_files = [f for f in repo_files if f.lower().endswith((".xlsx", ".xls"))]

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    excel_entries = []

    for file_name in excel_files:
        print(f"📄 Processing Excel: {file_name}")
        path = hf_hub_download(repo_id_docs, filename=file_name, repo_type="dataset")
        xls = pd.read_excel(path, sheet_name=None)

        for sheet_name, df in xls.items():
            df = df.fillna("").dropna(how="all")
            df.columns = [str(c).strip().lower() for c in df.columns]

            term_col = next((c for c in df.columns if "term" in c or "word" in c), None)
            if not term_col:
                print(f"⚠️ No 'term' column in {file_name}:{sheet_name}")
                continue

            for _, row in df.iterrows():
                term = str(row.get(term_col, "")).strip()
                if not term:
                    continue

                # Combine all columns with values
                parts = [
                    f"{c.capitalize()}: {str(row[c]).strip()}"
                    for c in df.columns if str(row[c]).strip()
                ]
                joined = " ".join(parts)
                if len(joined) < 80:  # Skip tiny entries
                    continue

                entry_text = f"Definition of {term}: {joined}"
                excel_entries.append({
                    "source": file_name,
                    "sheet": sheet_name,
                    "term": term,
                    "type": "Excel",
                    "file": file_name,
                    "text": entry_text,
                })

    if excel_entries:
        print(f"✅ Loaded {len(excel_entries)} Excel rows.")
        texts = [e["text"] for e in excel_entries]
        embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
        faiss.normalize_L2(embeddings)
        index.add(embeddings)
        metas.extend(excel_entries)
        print("✅ Excel content added to FAISS.")

    # --- STEP 3: WEB CONTENT ---
    try:
        print("🌐 Loading and embedding web content...")
        web_entries = web_crawler_loader(
            urls_file="/home/user/app/data/urls.txt",
            cache_path="/home/user/app/persistent/web_cache.json",
            max_pages=3,
            timeout=20,
            force_refresh=False,
        )
        if web_entries:
            web_entries = [e for e in web_entries if len(e.get("text", "")) > 200]
            print(f"✅ Retrieved {len(web_entries)} web entries.")
            web_texts = [e["text"] for e in web_entries]
            web_emb = model.encode(web_texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
            faiss.normalize_L2(web_emb)
            index.add(web_emb)
            metas.extend(web_entries)
            print("✅ Web content added to FAISS.")
        else:
            print("⚠️ No web entries found.")
    except Exception as e:
        print(f"⚠️ Web content embedding failed: {e}")

    # --- STEP 4: SAVE & UPLOAD ---
    faiss_path = os.path.join(local_dir, "faiss.index")
    meta_path = os.path.join(local_dir, "faiss.index.meta.json")
    faiss.write_index(index, faiss_path)
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(metas, f, indent=2)
    print(f"💾 Local FAISS index saved ({len(metas)} entries).")

    try:
        _upload_to_dataset(faiss_path, meta_path, repo_id_index)
        print(f"☁️ Uploaded latest FAISS index ({len(metas)} entries) to {repo_id_index}.")
    except Exception as e:
        print(f"⚠️ Upload to Hugging Face failed: {e}")

    print("✅ Glossary + Excel + Web FAISS rebuilt successfully.")
    return f"✅ Rebuild complete: {len(metas)} entries (including Excel + Web)."

# ----------------------------------------------------------
# 4. REBUILD GLOSSARY
# ----------------------------------------------------------
def rebuild_glossary():
    try:
        from core.glossary_builder import rebuild_and_upload
        rebuild_and_upload()
        return "✅ Glossary rebuilt and uploaded successfully."
    except Exception as e:
        return f"⚠️ Glossary rebuild failed: {e}"

# ----------------------------------------------------------
# 5. CHATBOT LOGIC
# ----------------------------------------------------------
def chat_answer(query, mode="default"):
    # The UI wires only the query box into this function, so `mode` needs a
    # default; "default" is an assumed value passed through to the retriever.
    try:
        query_clean = query.strip()
        if not query_clean:
            return "<i>⚠️ Please enter a valid query.</i>"

        from core.hybrid_retriever import summarize_combined
        return summarize_combined(query_clean, mode=mode)
    except Exception as e:
        print("❌ Chatbot error:", e)
        return f"<i>⚠️ Error: {e}</i>"

# ----------------------------------------------------------
# 6. GRADIO UI (Simplified + Keyboard Support)
# ----------------------------------------------------------
with gr.Blocks(theme="gradio/soft") as demo:
    gr.Markdown(f"# {APP_TITLE}")
    gr.Markdown(APP_DESC)

    # 🔹 Main input + output areas
    query_box = gr.Textbox(
        label="Ask your clinical trial question",
        placeholder="e.g. What is an eCRF?",
        lines=2,
        show_label=True
    )
    output_box = gr.HTML(label="Answer")

    # 🔹 Control buttons row
    with gr.Row():
        submit_btn = gr.Button("🚀 Submit", variant="primary")
        rebuild_btn = gr.Button("🔁 Rebuild Index")
        rebuild_glossary_btn = gr.Button("📘 Rebuild Glossary")
        clear_btn = gr.Button("🧹 Clear Cache / Index")

    # 🔹 Event bindings
    submit_btn.click(fn=chat_answer, inputs=[query_box], outputs=output_box)
    query_box.submit(fn=chat_answer, inputs=[query_box], outputs=output_box)  # ↵ Press Enter = Submit

    rebuild_btn.click(fn=rebuild_index, outputs=output_box)
    rebuild_glossary_btn.click(fn=rebuild_glossary, outputs=output_box)
    clear_btn.click(fn=clear_index, outputs=output_box)

# ----------------------------------------------------------
# 7. LAUNCH APP
# ----------------------------------------------------------
if __name__ == "__main__":
    print("🚀 Starting Clinical Trial Chatbot...")
    print("🧠 Initializing retriever warm-up...")
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
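
For a quick end-to-end check of the launched UI, the chat endpoint can be called with `gradio_client`. This is a hypothetical smoke test: the `/chat_answer` endpoint name assumes Gradio's default of deriving `api_name` from the bound function name; run `client.view_api()` to confirm the actual names.

```python
# Hypothetical smoke test against a locally running instance of the app above.
from gradio_client import Client

client = Client("http://127.0.0.1:7860")
answer_html = client.predict("What is an eCRF?", api_name="/chat_answer")
print(answer_html)
```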
cleanup_space.py ADDED
@@ -0,0 +1,135 @@
"""
cleanup_space.py
----------------
Maintenance script for Hugging Face Space cleanup.
- Removes caches, temp files, and large unneeded assets.
- Keeps only FAISS index + metadata + glossary.
- Reuploads them to CT-Chat-Index dataset.
"""
import os
import shutil
import time
from datetime import datetime
from huggingface_hub import HfApi, upload_file, HfFolder

# 🔧 Configuration
REPO_ID = "essprasad/CT-Chat-Index"  # Dataset repo
REPO_TYPE = "dataset"
PERSISTENT_DIR = "persistent"
DATA_DIR = "data"
KEEP_FILES = [
    "persistent/faiss.index",
    "persistent/faiss.index.meta.json",
    "data/glossary.json"
]

api = HfApi()
token = HfFolder.get_token() or os.getenv("HF_TOKEN", None)

def readable_size(path):
    """Return human-readable folder size."""
    total = 0
    for dirpath, _, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.exists(fp):
                total += os.path.getsize(fp)
    for unit in ["B", "KB", "MB", "GB"]:
        if total < 1024.0:
            return f"{total:.2f} {unit}"
        total /= 1024.0
    return f"{total:.2f} TB"

# --------------------------------------------------------------------
# 1. Clean caches, logs, temp files
# --------------------------------------------------------------------
def clean_temp_and_cache():
    print("🧹 Cleaning temporary and cache directories...")
    for path in ["/root/.cache", "/home/user/.cache", "/tmp"]:
        shutil.rmtree(path, ignore_errors=True)
        os.makedirs(path, exist_ok=True)

    # Remove logs larger than 5 MB
    log_dir = "logs"
    if os.path.exists(log_dir):
        for f in os.listdir(log_dir):
            fp = os.path.join(log_dir, f)
            if os.path.isfile(fp) and os.path.getsize(fp) > 5 * 1024 * 1024:
                os.remove(fp)
                print(f"🗑️ Removed oversized log: {fp}")

# --------------------------------------------------------------------
# 2. Remove large documents & orphan files
# --------------------------------------------------------------------
def trim_data():
    print("📦 Trimming large files from data/public_docs...")
    doc_dir = os.path.join(DATA_DIR, "public_docs")
    if not os.path.exists(doc_dir):
        return

    for root, _, files in os.walk(doc_dir):
        for f in files:
            fp = os.path.join(root, f)
            if os.path.getsize(fp) > 10 * 1024 * 1024:  # >10MB
                print(f"🗑️ Removing large doc: {fp}")
                os.remove(fp)

# --------------------------------------------------------------------
# 3. Verify and keep only essential files
# --------------------------------------------------------------------
def preserve_key_files():
    print("🔒 Preserving essential files (index + glossary)...")
    all_keep = []
    for f in KEEP_FILES:
        if os.path.exists(f):
            print(f"✅ Keeping: {f}")
            all_keep.append(f)
        else:
            print(f"⚠️ Missing expected file: {f}")
    return all_keep

# --------------------------------------------------------------------
# 4. Upload cleaned files to dataset
# --------------------------------------------------------------------
def upload_to_hub(files):
    if not token:
        print("❌ No HF token found. Please add HF_TOKEN with write access.")
        return
    print(f"🚀 Uploading cleaned files to {REPO_ID} ...")
    for f in files:
        try:
            upload_file(
                path_or_fileobj=f,
                path_in_repo=f,
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                token=token,
                commit_message=f"Auto-cleanup sync {datetime.utcnow().isoformat()}"
            )
            print(f"✅ Uploaded: {f}")
        except Exception as e:
            print(f"⚠️ Failed to upload {f}: {e}")

# --------------------------------------------------------------------
# 5. Disk usage report
# --------------------------------------------------------------------
def report_usage():
    print("\n📊 Disk Usage Summary:")
    for path in ["persistent", "data", "/home/user"]:
        if os.path.exists(path):
            print(f"{path}: {readable_size(path)}")

# --------------------------------------------------------------------
# Run everything
# --------------------------------------------------------------------
if __name__ == "__main__":
    start = time.time()
    print("===== 🧹 Starting Space Cleanup =====")
    clean_temp_and_cache()
    trim_data()
    files = preserve_key_files()
    upload_to_hub(files)
    report_usage()
    print(f"\n✅ Cleanup finished in {time.time() - start:.2f}s")
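
cleanup_space.py is written to run standalone via its `__main__` block. If the same maintenance pass should also run whenever the Space boots, one hedged option is to invoke it from app.py's prelaunch phase; the sketch below assumes cleanup_space.py sits next to app.py and that an HF_TOKEN secret is configured for the upload step.

```python
# Hypothetical hook for app.py: run the maintenance script once at startup.
import os
import subprocess
import sys

script = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cleanup_space.py")
if os.path.exists(script):
    subprocess.run([sys.executable, script], check=False)  # never block app launch
```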
gitattributes ADDED
@@ -0,0 +1,49 @@
# ================================================
# ⚙️ Clinical Research Chatbot – Simplified .gitattributes
# ================================================
# Version: Safe for Hugging Face UI-only management
# (no Git LFS required)
# --------------------------------

# --------------------------------
# Code & Config Files (text mode)
# --------------------------------
*.py text eol=lf
*.txt text eol=lf
*.md text eol=lf
*.json text eol=lf
*.csv text eol=lf
*.yaml text eol=lf
*.yml text eol=lf
*.html text eol=lf
*.css text eol=lf
*.js text eol=lf
*.ini text eol=lf
*.cfg text eol=lf
*.toml text eol=lf
requirements.txt text eol=lf
runtime.txt text eol=lf
runtime.yaml text eol=lf
*.gitignore text eol=lf
*.gitattributes text eol=lf

# --------------------------------
# Binary & Data Files (no LFS)
# --------------------------------
*.pdf binary
*.docx binary
*.xlsx binary
*.zip binary
*.ppt binary
*.odt binary
*.png binary
*.jpg binary
*.jpeg binary
*.tif binary
*.tiff binary
*.gif binary

# --------------------------------
# Default handling
# --------------------------------
* text=auto eol=lf
gitignore ADDED
@@ -0,0 +1,71 @@
# =========================================
# 🧪 Clinical Research Chatbot – .gitignore
# =========================================

# -------------------------
# Python
# -------------------------
__pycache__/
*.pyc
*.pyo
*.pyd
*.pkl
*.pickle

# -------------------------
# Environment / virtualenv
# -------------------------
.venv/
env/
venv/
ENV/
*.env

# -------------------------
# Data & Logs
# -------------------------
logs/*
!logs/.gitkeep
# keep recent chatbot logs
!logs/query_log.csv

# -------------------------
# Data Folders
# -------------------------
# Keep reference docs & FAQs, ignore temporary files
data/public_docs/*
!data/public_docs/.gitkeep

data/faq/*
!data/faq/.gitkeep

# Glossary and metadata files should stay (important for chatbot)
!data/glossary.json
!data/faq_data.json
!data/clinical_faq.json

# Ignore temporary FAISS or index rebuilds
persistent/*
!persistent/.gitkeep
!persistent/faiss.index
!persistent/faiss.index.meta.json

# -------------------------
# Hugging Face + Transformers cache
# -------------------------
.cache/
datasets/
transformers_cache/
.huggingface/

# -------------------------
# IDE / Editor
# -------------------------
.vscode/
.idea/
.DS_Store

# -------------------------
# Miscellaneous
# -------------------------
*.tmp
*.bak
gitignore (1) ADDED
@@ -0,0 +1,71 @@
# =========================================
# 🧪 Clinical Research Chatbot – .gitignore
# =========================================

# -------------------------
# Python
# -------------------------
__pycache__/
*.pyc
*.pyo
*.pyd
*.pkl
*.pickle

# -------------------------
# Environment / virtualenv
# -------------------------
.venv/
env/
venv/
ENV/
*.env

# -------------------------
# Data & Logs
# -------------------------
logs/*
!logs/.gitkeep
# keep recent chatbot logs
!logs/query_log.csv

# -------------------------
# Data Folders
# -------------------------
# Keep reference docs & FAQs, ignore temporary files
data/public_docs/*
!data/public_docs/.gitkeep

data/faq/*
!data/faq/.gitkeep

# Glossary and metadata files should stay (important for chatbot)
!data/glossary.json
!data/faq_data.json
!data/clinical_faq.json

# Ignore temporary FAISS or index rebuilds
persistent/*
!persistent/.gitkeep
!persistent/faiss.index
!persistent/faiss.index.meta.json

# -------------------------
# Hugging Face + Transformers cache
# -------------------------
.cache/
datasets/
transformers_cache/
.huggingface/

# -------------------------
# IDE / Editor
# -------------------------
.vscode/
.idea/
.DS_Store

# -------------------------
# Miscellaneous
# -------------------------
*.tmp
*.bak
lfsconfig ADDED
@@ -0,0 +1,4 @@
[lfs]
    url = https://huggingface.co/
    locksverify = true
    batch = true
postBuild ADDED
@@ -0,0 +1,60 @@
#!/bin/bash
set -e

echo "🔧 PostBuild starting — optimizing CT-Chat Space..."

# -------------------------------------------------------
# 1️⃣ Fix dependency mismatches (Gradio & Websockets)
# -------------------------------------------------------
pip install --force-reinstall --no-cache-dir "websockets>=12" "gradio-client>=1.3.0"

# -------------------------------------------------------
# 2️⃣ Create and register shared NLTK data directory
# -------------------------------------------------------
echo "📁 Preparing shared NLTK data directory..."
export NLTK_DATA="/usr/local/share/nltk_data"
mkdir -p $NLTK_DATA
chmod -R 777 $NLTK_DATA

# -------------------------------------------------------
# 3️⃣ Preload all required NLTK resources (including punkt_tab)
# -------------------------------------------------------
echo "📦 Downloading NLTK resources..."
python -m nltk.downloader -d $NLTK_DATA \
    punkt punkt_tab averaged_perceptron_tagger averaged_perceptron_tagger_eng stopwords wordnet omw-1.4

# -------------------------------------------------------
# 4️⃣ Verify NLTK installs and paths
# -------------------------------------------------------
python - <<'PYCODE'
import nltk, os
print(f"NLTK data path → {nltk.data.path}")
# nltk.data.find() expects category-prefixed resource paths.
for pkg in ["tokenizers/punkt", "tokenizers/punkt_tab", "taggers/averaged_perceptron_tagger_eng", "corpora/stopwords", "corpora/wordnet"]:
    try:
        nltk.data.find(pkg)
        print(f"✅ Verified NLTK resource: {pkg}")
    except LookupError:
        print(f"⚠️ Missing NLTK resource: {pkg}")
PYCODE

# -------------------------------------------------------
# 5️⃣ Clean caches (stay <50GB)
# -------------------------------------------------------
echo "🧹 Cleaning Hugging Face + Torch caches..."
rm -rf /root/.cache/* || true
rm -rf /home/user/.cache/* || true
rm -rf /usr/local/share/nltk_data/taggers/__pycache__ || true
rm -rf /home/user/app/hf_cache/* || true
rm -rf /home/user/app/logs/* || true

# -------------------------------------------------------
# 6️⃣ Ensure writable temporary cache for runtime
# -------------------------------------------------------
echo "📦 Preparing /tmp/hf_cache..."
mkdir -p /tmp/hf_cache
chmod -R 777 /tmp/hf_cache

# -------------------------------------------------------
# ✅ Done
# -------------------------------------------------------
echo "✅ PostBuild completed successfully — NLTK preloaded (punkt_tab OK), cache ready at /tmp/hf_cache."
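
postBuild installs the NLTK data at build time, but the NLTK_DATA export made in that shell may not be visible to the running app. A small runtime sketch keeps the two in sync; the directory path matches postBuild, and the on-the-fly download is an assumed fallback for local runs where postBuild never executed.

```python
# Runtime counterpart to postBuild: make sure NLTK looks in the shared data dir.
import os
import nltk

nltk_dir = os.environ.get("NLTK_DATA", "/usr/local/share/nltk_data")
if nltk_dir not in nltk.data.path:
    nltk.data.path.append(nltk_dir)

try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    # Assumed fallback for environments where postBuild did not run.
    nltk.download("punkt_tab", download_dir=nltk_dir, quiet=True)
```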
requirements.txt ADDED
@@ -0,0 +1,43 @@
# =======================================
# 🧪 Clinical Research Chatbot Requirements
# =======================================

# --- Core Libraries ---
faiss-cpu
torch
transformers
sentence-transformers
sentencepiece
fastapi
whoosh

# --- Data Handling ---
numpy
pandas
datasets

# --- Document Parsing ---
pymupdf
python-docx
openpyxl
beautifulsoup4
requests
aiofiles
rank-bm25

# --- NLP + Text Processing ---
nltk
scikit-learn
regex
tqdm

# --- Web + Interface ---
huggingface-hub>=0.23.0
gradio
gradio-client
uvicorn
spaces
python-multipart

# --- Networking / Compatibility Fix ---
websockets>=12
runtime.txt ADDED
@@ -0,0 +1 @@
python-3.10
runtime.yaml ADDED
@@ -0,0 +1,26 @@
# =======================================
# ⚙️ Hugging Face Space Runtime Configuration
# =======================================

python: "3.10"            # Stable for FAISS + Gradio + Transformers

# App entrypoint (FastAPI with Gradio mount)
entrypoint: "app:app"

hardware: "cpu-basic"     # For small to medium FAISS indexes
# hardware: "cpu-upgrade" # Uncomment for larger index (>100 MB) or slower summaries

timeout: 600              # 10-minute build timeout
autoreload: true          # Auto-reload app on file updates (optional)

# Cache persistent resources (prevents redownload)
cache:
  - data/
  - persistent/
  - logs/

# Explicit build hook (optional, for clarity)
build:
  commands:
    - bash postBuild