AlphaAngel444 committed on
Commit
798fe27
·
verified ·
1 Parent(s): d075535

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +21 -14
  2. app.py +109 -0
  3. env.download +3 -0
  4. requirements.txt +8 -3
  5. service-account-key (1).json +13 -0
README.md CHANGED
@@ -1,20 +1,27 @@
1
  ---
2
- title: ClinicalTrialDiscoveryRAGProject
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
  pinned: false
11
- short_description: Clinical Trial Discovery System by RAG
12
- license: mit
13
  ---
14
 
15
- # Welcome to Streamlit!
16
 
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
18
 
19
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Clinical Trial Discovery App
3
+ emoji: 🧬
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: streamlit
7
+ sdk_version: 1.46.1
8
+ app_file: app.py
 
9
  pinned: false
 
 
10
  ---
11
 
 
12
 
13
+ # 🧠 Clinical Trial Discovery (RAG App)
14
 
15
+ This is a Streamlit-based RAG (Retrieval-Augmented Generation) app for discovering clinical trials using natural language queries. It uses:
16
+
17
+ - ✅ BioBERT-SBERT for semantic embeddings
18
+ - ✅ Pinecone for fast vector search
19
+ - ✅ Mistral-7B-Instruct for generating answers (this model choice may change)
20
+ - ✅ Firestore for storing user bookmarks only
21
+
22
+ ## 🚀 Features
23
+
24
+ - 🔍 Ask natural questions (e.g., "thymus cancer for women")
25
+ - 🧠 Get LLM-generated answers based on real trial descriptions
26
+ - ⭐ Bookmark trials (saved to Firestore)
27
+ - 🧭 Tabbed layout for easy navigation
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit front end for clinical-trial discovery via retrieval-augmented generation.

Flow for each question typed into the first tab:

1. Embed the question with a BioBERT sentence-transformer.
2. Query the Pinecone index for the five most similar trial chunks.
3. Send the retrieved chunks plus the question to a Hugging Face inference
   endpoint and display the generated answer.
4. List the retrieved chunks, each with a button that stores a bookmark in
   Firestore under ``Users/{USER_ID}/Bookmarks/{nct_id}``.

The second tab lists the bookmarks stored for ``USER_ID``.
"""
import os  # for reading environment variables

import firebase_admin  # for access to Firebase
import streamlit as st  # for web UI creation
from dotenv import load_dotenv
from firebase_admin import credentials, firestore
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint  # for accessing the Hugging Face inference endpoint
from pinecone import Pinecone, ServerlessSpec  # for accessing the Pinecone vector DB
from sentence_transformers import SentenceTransformer  # for embedding queries into dense vectors

# === Load environment variables ===
# load_dotenv does nothing if the file is absent; the values may then come
# from the process environment instead.
load_dotenv(".env.local")

# === CONFIG ===
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
USER_ID = "demo_user"  # static user for testing

# === Firebase Setup ===
# Streamlit re-executes this script on every interaction, so the default
# Firebase app must only be initialised once per process. get_app() raises
# ValueError when the default app does not exist yet.
try:
    firebase_admin.get_app()
except ValueError:
    cred = credentials.Certificate("service-account-key.json")
    firebase_admin.initialize_app(cred)
db = firestore.client()

# === Pinecone Setup ===
pc = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = "clinical-trials-rag"
index = pc.Index(INDEX_NAME)


# === Embedding Model ===
@st.cache_resource
def _load_embed_model():
    """Build the BioBERT sentence-transformer once and reuse it across reruns."""
    return SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")


embed_model = _load_embed_model()

# === LLM Setup ===
llm = HuggingFaceEndpoint(
    endpoint_url="https://f9eftfrz5qna6j32.us-east-1.aws.endpoints.huggingface.cloud",  # Hugging Face Inference Endpoint, billed per hour
    huggingfacehub_api_token=HF_TOKEN,
    temperature=0.7,
    max_new_tokens=256,
)
prompt_template = PromptTemplate.from_template(
    "Context:\n{context}\n\nQuestion: {question}\nAnswer:"
)

# === Tabs ===
tab1, tab2 = st.tabs(["🔍 Ask a Question", "⭐ Bookmarked Trials"])

# === TAB 1: Question page with sample questions to guide the user ===
with tab1:
    st.title("Clinical Trial Discovery")

    st.markdown("""
    💡 **Example question formats:**
    - What clinical trials are available for non-small cell lung cancer in California?
    - List phase 3 trials for Type 1 Diabetes recruiting in 2025.
    - What studies on immunotherapy for melanoma are active in Europe?
    - Are there trials targeting heart disease patients over 65?
    """)

    user_query = st.text_input("🔍 Enter your clinical trial questions below:")  # query input

    if user_query:  # runs once the user has entered a question
        with st.spinner("Retrieving relevant trials..."):  # spinner while Pinecone is searched
            vec = embed_model.encode(user_query).tolist()  # embed the query with the BioBERT model
            # Ask Pinecone for the 5 most similar vectors, with their metadata.
            results = index.query(vector=vec, top_k=5, include_metadata=True)
            matches = results["matches"]
            # .get keeps a match without a "text" field from raising KeyError,
            # consistent with the display loop below.
            contexts = [m["metadata"].get("text", "") for m in matches]

        if not matches:
            # Nothing retrieved: skip the LLM call rather than prompting it
            # with an empty context.
            st.warning("No matching clinical trials were found for that question.")
        else:
            # Prepare the prompt for the LLM.
            joined_context = "\n".join(contexts)  # one contextual block from the retrieved chunks
            prompt = prompt_template.format(context=joined_context, question=user_query)

            # Call the LLM endpoint to generate the answer.
            with st.spinner("Generating answer..."):
                answer = llm.invoke(prompt)
            st.subheader("🧠 Answer:")  # display answer in UI
            st.write(answer)

            st.markdown("---")  # related trials are listed under the answer
            st.subheader("📋 Related Clinical Trials")

            for i, match in enumerate(matches):  # one expander per Pinecone match
                meta = match["metadata"]
                # Fall back to a chunk ID when 'nct_id' is missing or empty;
                # the value is also used as the Firestore document ID.
                nct_id = meta.get("nct_id") or f"chunk_{i}"
                chunk_text = meta.get("text", "")[:400]  # first 400 characters of the trial chunk
                with st.expander(f"Trial: {nct_id}"):
                    st.write(chunk_text + "...")
                    # Bookmark button inside each expander. Bookmarks are
                    # saved to Users/demo_user/Bookmarks/{nct_id}.
                    if st.button(f"⭐ Bookmark {nct_id}", key=f"bookmark_{i}"):
                        bookmark_ref = (
                            db.collection("Users")
                            .document(USER_ID)
                            .collection("Bookmarks")
                            .document(nct_id)
                        )
                        bookmark_ref.set({
                            "nct_id": nct_id,
                            "text": chunk_text,
                        })
                        st.success(f"Bookmarked {nct_id} to Firestore.")

# === TAB 2: Bookmarked Trials ===
with tab2:
    st.title("⭐ Your Bookmarked Trials")
    # Retrieve bookmarks from Firestore.
    docs = db.collection("Users").document(USER_ID).collection("Bookmarks").stream()
    bookmarks = [doc.to_dict() for doc in docs]

    if not bookmarks:
        st.info("You haven't bookmarked any trials yet.")
    else:  # otherwise display bookmarked trials in expanders
        for b in bookmarks:
            with st.expander(f"{b['nct_id']}"):
                st.write(b["text"])
env.download ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ PINECONE_API_KEY=pcsk_5xJwr6_GLYcR7tgYqCdH1AzCai33hLoESJiw2kQhQjFXfVNguWchmrq4DHrtKuKdWPKsy
2
+ PINECONE_ENV=us-east-1
3
+ INDEX_NAME=clinical-trials-rag
requirements.txt CHANGED
@@ -1,3 +1,8 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
1
+ streamlit
2
+ sentence-transformers
3
+ transformers
4
+ langchain
5
+ langchain-community
6
+ langchain-huggingface
7
+ firebase-admin
8
+ pinecone>=3.0.0
service-account-key (1).json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "service_account",
3
+ "project_id": "clinical-trial-app-3ecb4",
4
+ "private_key_id": "81877eea413e50fb90f5ab80ac2cfb8ca3ebc2a5",
5
+ "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQD1CyaZWwKwcdYz\nKD6WWav6BojZ7SjSRmOCp/gICcTnNELoJgXSiff592cYCYsXZfFkqlrtvqIFzxQV\nOFUeQD5t7vHsaFiqQ+OWBWAk8fcW5UXa7VyjztaWFxm1BRLkGlVv+dIAvpQPqLaY\n8X7GD22qv/k0MDWNcXWyew9HSa77gOE8fuoSHStRRurWKv8yrmcsFqtLAmBMMeOG\nrfRb0gnLaSd9hDM9iM81BCCX+yOT1bvgnWwVSoa1TOJgaOR0Rg0JjG75myGRNDLW\nTJC7EoBEuwhbL6bKJLZwg3Kt7cFK3n1CVkIrSFPEIbr+KYSNUZgEZePRiaiAMhRZ\nBCPE6UFpAgMBAAECggEAY0guTmoib8zfpYCDwzT3yeiF4A4HwRKF1PUrOE6E+cwh\n458sMr51u1By87lCIqvKHygqiTL4wHPDRkm5qvUKlk3+tPMeIXY6Kplo9+8VVrB0\nGhybsT7nkI4xezdgDQ1iWmkUnfqZULLhsJv/6k1r+Iepd2yk9fbnYfYcHZ0MCZfG\nNSPGQ/H71z+SdzZkZqCcYYQm2lKfsYfHOpwGGz1MVcaskVUNRWPspkIUwRSQ43mY\nb+9U80yAKWVRwLUBSbiz/FK1/hMvlzZoNjoHpqY6/AuwTBiitfLWmIeqgHjSu0/z\ndt6cBSdAvagAIprQ/qC1bUq3SylvZEFIi8UlkBzhNQKBgQD60Y60BuadSXZu8uGZ\nQU3QjsLksHXjDut5YmJyIjcYrYh9Cg54lJ5/z/um7WKdiKxNxBSjUS9NLbkfa5oP\nL2iE0Vsq+WsHfnKHpMg6R2bcc5R8n2QWyZGzsrVCWeYq+o5cjR78ENHcxvmDJkrd\nvJZvJxxXmY3E7AJRo6vBGXwmPwKBgQD6Gw1o+vwX4G0xbA6BJl1ZTjx4PvajNFo9\nMPX+stt+O5J0NcyR/8Kn6t0ef6a5qWCPClMTJpEGWwzLX2lwDzZ24nKNVT1rX26i\nsYnpU87LfMjUwd6k44ydCLHmyN3vh/bmziC2+VVUUgVAfEegsti3/Ihfg5x+oUNV\ne+Ctqks+VwKBgE6i0s3IeBcKCDKivW4yFjZz+9B9Loigjd0BpoHIDmQTS/5/36eY\nWNUTnP9p34gqaHL9LcdCVcUpt6eNMcDfCTLS/HVNu2ufDkNOu2PiLPKi3gPwaQ3n\n5mFjfwatbsc8xNNpfzRiBZnlXCbtI32/eZ6hsXYZc2Qw5k04NkoVNmI7AoGAOGCi\nvDhfXS58zrgx6NDyF/B31w8yX4WslcCUow5ERgc9sy5xZ7PEeD+MCpTxy0Yv+u5z\n3YxDArDBiJKAXP9A4rmW4t8FElAXy1rD4LHAmsQNLVBqVLbqend8Sq6awKTgdhSe\n8T/xCSnX/zpElyfZjFfDkexD+ZN2by2Wbu9FOM8CgYAAzEBP3KZLFLY9oRwchK6u\ngqDTHGfEY1ui6UVqRGUslufG7d8FkU7LW4VPp1s25Xn8RQB1B0Q+LMUYyhFkoT6v\n9IkHwp9rKDwUsYH6puWbCnx3BglX8U/P0f2aaa9Zqxjnm/3SNuw1dparBXnjshN1\ntuY6ALbo6LRFGOyaJUz9bg==\n-----END PRIVATE KEY-----\n",
6
+ "client_email": "firebase-adminsdk-fbsvc@clinical-trial-app-3ecb4.iam.gserviceaccount.com",
7
+ "client_id": "101962704350860272575",
8
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
9
+ "token_uri": "https://oauth2.googleapis.com/token",
10
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
11
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/firebase-adminsdk-fbsvc%40clinical-trial-app-3ecb4.iam.gserviceaccount.com",
12
+ "universe_domain": "googleapis.com"
13
+ }