Upload 5 files
Browse files- README.md +21 -14
- app.py +109 -0
- env.download +3 -0
- requirements.txt +8 -3
- service-account-key (1).json +13 -0
README.md
CHANGED
|
@@ -1,20 +1,27 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
- streamlit
|
| 10 |
pinned: false
|
| 11 |
-
short_description: Clinical Trial Discovery System by RAG
|
| 12 |
-
license: mit
|
| 13 |
---
|
| 14 |
|
| 15 |
-
# Welcome to Streamlit!
|
| 16 |
|
| 17 |
-
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Clinical Trial Discovery App
|
| 3 |
+
emoji: 🧬
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: 1.46.1
|
| 8 |
+
app_file: app.py
|
|
|
|
| 9 |
pinned: false
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
|
|
|
| 12 |
|
| 13 |
+
# 🧠 Clinical Trial Discovery (RAG App)
|
| 14 |
|
| 15 |
+
This is a Streamlit-based RAG (Retrieval-Augmented Generation) app for discovering clinical trials using natural language queries. It uses:
|
| 16 |
+
|
| 17 |
+
- ✅ BioBERT-SBERT for semantic embeddings
|
| 18 |
+
- ✅ Pinecone for fast vector search
|
| 19 |
+
- ✅ Mistral-7B-Instruct for generating answers (this model may change)
|
| 20 |
+
- ✅ Firestore for storing user bookmarks only
|
| 21 |
+
|
| 22 |
+
## 🚀 Features
|
| 23 |
+
|
| 24 |
+
- 🔍 Ask natural-language questions (e.g., "thymus cancer for women")
|
| 25 |
+
- 🧠 Get LLM-generated answers based on real trial descriptions
|
| 26 |
+
- ⭐ Bookmark trials (saved to Firestore)
|
| 27 |
+
- 🧭 Tabbed layout for easy navigation
|
app.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st  # web UI
from sentence_transformers import SentenceTransformer  # embeds queries into dense vectors
from pinecone import Pinecone, ServerlessSpec  # Pinecone vector DB client
import os  # for reading environment variables
from langchain_huggingface import HuggingFaceEndpoint  # Hugging Face inference endpoint client
from langchain.prompts import PromptTemplate
import firebase_admin  # Firebase access
from firebase_admin import credentials, firestore
from dotenv import load_dotenv

# === Load environment variables ===
load_dotenv(".env.local")

# === CONFIG ===
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
USER_ID = "demo_user"  # static user for testing
# The index name and credentials path can be overridden through the
# environment; the defaults are the values that used to be hard-coded.
INDEX_NAME = os.getenv("INDEX_NAME", "clinical-trials-rag")
FIREBASE_KEY_PATH = os.getenv("GOOGLE_APPLICATION_CREDENTIALS", "service-account-key.json")
EMBED_MODEL_NAME = "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"
TOP_K = 5  # number of nearest chunks fetched from Pinecone per question
PREVIEW_CHARS = 400  # characters of each chunk shown (and stored with a bookmark)

# Fail early with a readable message instead of an opaque client error.
_missing = [
    name
    for name, value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("HUGGINGFACEHUB_API_TOKEN", HF_TOKEN),
    )
    if not value
]
if _missing:
    st.error(f"Missing required environment variable(s): {', '.join(_missing)}")
    st.stop()

# === Firebase Setup ===
# Streamlit re-executes this script on every interaction, so only initialise
# the default Firebase app when it does not exist yet.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(credentials.Certificate(FIREBASE_KEY_PATH))
db = firestore.client()

# === Pinecone Setup ===
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)


# === Embedding Model ===
@st.cache_resource
def _load_embed_model():
    """Load the BioBERT sentence-transformer once and reuse it across reruns."""
    return SentenceTransformer(EMBED_MODEL_NAME)


embed_model = _load_embed_model()

# === LLM Setup ===
llm = HuggingFaceEndpoint(
    # Dedicated Hugging Face Inference Endpoint (billed per hour).
    endpoint_url="https://f9eftfrz5qna6j32.us-east-1.aws.endpoints.huggingface.cloud",
    huggingfacehub_api_token=HF_TOKEN,
    temperature=0.7,
    max_new_tokens=256,
)
prompt_template = PromptTemplate.from_template(
    "Context:\n{context}\n\nQuestion: {question}\nAnswer:"
)

# === Tabs ===
tab1, tab2 = st.tabs(["🔍 Ask a Question", "⭐ Bookmarked Trials"])

# === TAB 1: Question page with sample questions to guide the user ===
with tab1:
    st.title("Clinical Trial Discovery")

    st.markdown("""
💡 **Example question formats:**
- What clinical trials are available for non-small cell lung cancer in California?
- List phase 3 trials for Type 1 Diabetes recruiting in 2025.
- What studies on immunotherapy for melanoma are active in Europe?
- Are there trials targeting heart disease patients over 65?
""")

    user_query = st.text_input("🔍 Enter your clinical trial questions below:")

    if user_query:  # runs once the user has submitted a non-empty question
        with st.spinner("Retrieving relevant trials..."):
            # Embed the question and fetch the TOP_K most similar chunks.
            vec = embed_model.encode(user_query).tolist()
            results = index.query(vector=vec, top_k=TOP_K, include_metadata=True)
            # A match may carry no metadata; normalise to a dict so the
            # lookups below never raise.
            metas = [(match["metadata"] or {}) for match in results["matches"]]

        if not metas:
            st.warning("No matching trials were found for this question.")
        else:
            # Build the LLM prompt from the retrieved chunk texts.
            joined_context = "\n".join(m["text"] for m in metas if m.get("text"))
            prompt = prompt_template.format(context=joined_context, question=user_query)

            with st.spinner("Generating answer..."):
                try:
                    answer = llm.invoke(prompt)
                except Exception as exc:  # UI boundary: report it, keep the page usable
                    answer = None
                    st.error(f"The language model could not generate an answer: {exc}")

            if answer is not None:
                st.subheader("🧠 Answer:")
                st.write(answer)

            st.markdown("---")
            st.subheader("📋 Related Clinical Trials")

            for i, meta in enumerate(metas):
                nct_id = meta.get("nct_id", f"chunk_{i}")  # fallback ID when 'nct_id' is missing
                chunk_text = meta.get("text", "")[:PREVIEW_CHARS]
                with st.expander(f"Trial: {nct_id}"):
                    st.write(chunk_text + "...")
                    # Bookmarks are saved to /Users/{USER_ID}/Bookmarks/{nct_id}.
                    if st.button(f"⭐ Bookmark {nct_id}", key=f"bookmark_{i}"):
                        (
                            db.collection("Users")
                            .document(USER_ID)
                            .collection("Bookmarks")
                            .document(nct_id)
                            .set({"nct_id": nct_id, "text": chunk_text})
                        )
                        st.success(f"Bookmarked {nct_id} to Firestore.")

# === TAB 2: Bookmarked Trials ===
with tab2:
    st.title("⭐ Your Bookmarked Trials")
    # Retrieve this user's bookmarks from Firestore.
    bookmark_docs = list(
        db.collection("Users").document(USER_ID).collection("Bookmarks").stream()
    )

    if not bookmark_docs:
        st.info("You haven't bookmarked any trials yet.")
    else:
        for doc in bookmark_docs:
            data = doc.to_dict() or {}
            # Fall back to the document ID if the stored record lacks 'nct_id'.
            with st.expander(str(data.get("nct_id", doc.id))):
                st.write(data.get("text", ""))
|
env.download
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
PINECONE_API_KEY=<your-pinecone-api-key>
|
| 2 |
+
PINECONE_ENV=us-east-1
|
| 3 |
+
INDEX_NAME=clinical-trials-rag
|
requirements.txt
CHANGED
|
@@ -1,3 +1,8 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
sentence-transformers
|
| 3 |
+
transformers
|
| 4 |
+
langchain
|
| 5 |
+
langchain-community
|
| 6 |
+
langchain-huggingface
|
| 7 |
+
firebase-admin
|
| 8 |
+
pinecone>=3.0.0
|
service-account-key (1).json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"type": "service_account",
|
| 3 |
+
"project_id": "clinical-trial-app-3ecb4",
|
| 4 |
+
"private_key_id": "81877eea413e50fb90f5ab80ac2cfb8ca3ebc2a5",
|
| 5 |
+
"private_key": "<REDACTED>",
|
| 6 |
+
"client_email": "firebase-adminsdk-fbsvc@clinical-trial-app-3ecb4.iam.gserviceaccount.com",
|
| 7 |
+
"client_id": "101962704350860272575",
|
| 8 |
+
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
| 9 |
+
"token_uri": "https://oauth2.googleapis.com/token",
|
| 10 |
+
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
|
| 11 |
+
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/firebase-adminsdk-fbsvc%40clinical-trial-app-3ecb4.iam.gserviceaccount.com",
|
| 12 |
+
"universe_domain": "googleapis.com"
|
| 13 |
+
}
|