import dotenv
# Load environment variables from .env file
dotenv.load_dotenv()
import streamlit as st
import os
import sys
# --- Page configuration (must be the first Streamlit call in the script) ---
st.set_page_config(layout="wide", page_title="DrugBot", page_icon="💊")
# --- Custom CSS for reduced whitespace and colors ---
st.markdown(
"""
<style>
/* Reduce top padding for the main Streamlit app container */
.stApp {
padding-top: 0px; /* Reduced this value to minimize whitespace at the very top */
padding-bottom: 20px;
}
/* Set a subtle background color for the entire page */
body {
background-color: #f0f8ff; /* AliceBlue - a very light blue */
color: #333333; /* Dark gray for text */
}
/* Style for headers */
h1, h2, h3, h4, h5, h6 {
color: #1a5276; /* Darker blue for headings */
}
/* Style for buttons */
.stButton>button {
background-color: #28a745; /* Green for primary button */
color: white;
border-radius: 8px;
padding: 10px 20px;
border: none;
box-shadow: 2px 2px 5px rgba(0,0,0,0.2);
transition: background-color 0.3s ease;
}
.stButton>button:hover {
background-color: #218838; /* Darker green on hover */
}
/* Style for text areas and select boxes */
.stTextArea textarea, .stSelectbox [data-testid="stSelectbox"] {
border-radius: 8px;
border: 1px solid #cccccc;
}
/* Style for info, success, warning, error boxes */
.stAlert {
border-radius: 8px;
}
</style>
""",
unsafe_allow_html=True
)
# --- Global message log ---
# This list will store messages to be displayed in the log expander
app_messages = []
def log_message(msg_type, message):
"""
Append a message to the in-app log. Errors are also surfaced immediately via st.error;
other message types appear only in the "Application Logs" expander at the bottom of the page.
"""
app_messages.append((msg_type, message))
if msg_type == "error":
st.error(message)
# Add the 'Scripts' directory to the Python path
# This allows importing modules like Query_processing, Retrieval, and Answer_Generation
script_dir = os.path.join(os.path.dirname(__file__), 'Scripts')
log_message("info", f"Attempting to add '{script_dir}' to Python path.")
if script_dir not in sys.path:
sys.path.append(script_dir)
log_message("info", f"'{script_dir}' added to sys.path.")
else:
log_message("info", f"'{script_dir}' already in sys.path.")
# --- Debugging: Check if script files exist ---
script_files_to_check = {
"Query_processing.py": False,
"Retrieval.py": False,
"Answer_Generation.py": False
}
all_scripts_found = True
for script_name in script_files_to_check:
script_path = os.path.join(script_dir, script_name)
if os.path.exists(script_path):
script_files_to_check[script_name] = True
else:
all_scripts_found = False
log_message("error", f"Error: Script file not found at expected path: {script_path}")
if not all_scripts_found:
log_message("error", "One or more essential script files are missing from the 'Scripts' directory. "
"Please ensure your project structure is correct.")
st.stop() # Stop execution if critical files are missing
# Import your core logic modules
try:
from Query_processing import preprocess_query
from Retrieval import Retrieval_averagedQP
from Answer_Generation import answer_generation
log_message("success", "Core modules imported successfully!")
except ImportError as e:
log_message("error", f"Error importing core modules. Make sure 'Scripts' directory is correctly structured and contains "
f"Query_processing.py, Retrieval.py, and Answer_Generation.py. Error: {e}")
st.stop()
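# Interfaces assumed by this app, inferred from how the functions are called further down;
# the modules in Scripts/ are the authoritative definitions:
#   preprocess_query(query)                       -> ((intent, sub_intent), entities)
#   Retrieval_averagedQP(query, intent, entities) -> pandas.DataFrame of ranked chunks
#   answer_generation(query, chunks)              -> str (the generated answer)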
# --- Configuration ---
# Define paths to your data and vectors
# These paths are relative to the app.py location
DATASET_PATH = os.path.join(os.path.dirname(__file__), 'Datasets', 'flattened_drug_dataset_cleaned.csv')
VECTORS_DIR = os.path.join(os.path.dirname(__file__), 'Vectors')
FAISS_INDEX_PATH = os.path.join(VECTORS_DIR, 'faiss_index.idx')
DOC_METADATA_PATH = os.path.join(VECTORS_DIR, 'doc_metadata.pkl')
DOC_VECTORS_PATH = os.path.join(VECTORS_DIR, 'doc_vectors.npy')
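# Expected on-disk layout, inferred from the paths above and the Scripts check earlier:
#   app.py
#   Scripts/   Query_processing.py, Retrieval.py, Answer_Generation.py
#   Datasets/  flattened_drug_dataset_cleaned.csv
#   Vectors/   faiss_index.idx, doc_metadata.pkl, doc_vectors.npy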
# --- Cached Resources ---
# Use st.cache_resource to load heavy models and data only once
@st.cache_resource
def load_all_assets():
"""
Verifies the existence of necessary files and attempts to load core NLP models.
This function will be run only once across all user sessions.
"""
with st.spinner("Verifying medical knowledge base and models... This might take a moment."):
try:
# 1. Check for presence of FAISS and embedding files
if not os.path.exists(FAISS_INDEX_PATH):
log_message("error", f"Missing FAISS index file: {FAISS_INDEX_PATH}")
return False
if not os.path.exists(DOC_METADATA_PATH):
log_message("error", f"Missing document metadata file: {DOC_METADATA_PATH}")
return False
if not os.path.exists(DOC_VECTORS_PATH):
log_message("error", f"Missing document vectors file: {DOC_VECTORS_PATH}")
return False
# 2. SciSpaCy model check.
# Query_processing is expected to load the 'en_core_sci_md' model itself; loading it here
# as well would keep a second copy of a large model in memory, so we only record that
# expectation. If the model is missing, Query_processing will raise an OSError when it
# tries to load it; install it with, e.g.,
# `pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz`.
log_message("info", "SciSpaCy 'en_core_sci_md' model is expected to be loaded by Query_processing.")
log_message("success", "Medical knowledge base files verified. Models will be loaded as needed.")
return True # Indicate successful verification
except Exception as e:
log_message("error", f"Failed to verify assets. Please ensure all data and vector files are in their correct paths. Error: {e}")
return False
# Load all assets at the start of the application
assets_loaded = load_all_assets()
# --- Title and Header ---
st.title("💊 DrugBot")
st.markdown("---")
# --- Instructions ---
st.header("How to Use:")
st.write(
"""
Welcome to DrugBot, a retrieval-based medical drug QA chatbot! You can ask questions about medical drugs, and I will retrieve
information from a verified database to provide accurate, grounded answers.
1. **Select an example query** from the dropdown or **type your own question** in the text area below.
2. Click the **"Get Answer"** button.
3. Wait for the chatbot to process your query and generate an answer.
"""
)
st.markdown("---")
# --- Example Queries ---
st.header("Try These Examples:")
example_queries = [
"Select an example query...",
"What is the dosage for Azithromycin?",
"What are the side effects of Ibuprofen?",
"How should I take Amoxicillin?",
"What are the precautions for Warfarin?",
"What are the drug interactions for Metformin?",
"What is Paracetamol used for?",
"Can pregnant women take Aspirin?",
"How does Prednisone work?",
"What is the recommended dose for children for Tylenol?"
]
selected_example = st.selectbox(
"Choose a pre-defined question:",
example_queries
)
user_query = st.text_area(
"Or type your question here:",
value="" if selected_example == "Select an example query..." else selected_example,
height=100,
placeholder="e.g., What is the dosage for Azithromycin?"
)
# --- Chatbot Interaction ---
if st.button("Get Answer", type="primary"):
if not assets_loaded:
log_message("error", "Application assets failed to verify. Please check the console for errors.")
elif not user_query.strip():
log_message("warning", "Please enter a question or select an example query.")
else:
# Check for Groq API Key
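# The key can be supplied via a local .env file (loaded by python-dotenv at the top of
# this file) or, when running as a Hugging Face Space, as a Space secret named GROQ_API_KEY.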
if "GROQ_API_KEY" not in os.environ:
log_message("error", "GROQ_API_KEY environment variable not set. Please set it to use the chatbot.")
else:
with st.spinner("Thinking... Retrieving and generating answer..."):
try:
# 1. Preprocess Query
# Query_processing.py should handle its own spacy model loading.
(intent, sub_intent), entities = preprocess_query(user_query)
log_message("info", f"Detected Intent: {intent}, Sub-Intent: {sub_intent}, Entities: {entities}")
# 2. Retrieve Chunks
# Retrieval_averagedQP is expected to load FAISS index and vectors internally.
chunks = Retrieval_averagedQP(user_query, intent, entities)
if not chunks.empty: # Check if chunks DataFrame is not empty
# 3. Generate Answer
answer = answer_generation(user_query, chunks)
log_message("info", f"Generated Answer Content: {answer[:200]}...") # Log first 200 chars
if not answer.strip(): # Check if answer is empty after stripping whitespace
log_message("warning", "Answer generation returned an empty response.")
st.warning("Could not generate a clear answer for this query. Please try rephrasing.")
else:
log_message("success", "Answer generated successfully!")
st.success("Answer:") # Display success message
st.write(answer) # This prints the answer in the main area
with st.expander("See Retrieved Chunks (for debugging/transparency)"):
st.write("Top 3 Retrieved Chunks:")
for i, chunk in enumerate(chunks.head(3).to_dict(orient='records')): # Display top 3 for brevity
st.write(f"**Chunk {i+1}:**")
st.json(chunk) # Use st.json for better display of dict
st.markdown("---")
else:
log_message("warning", "No relevant information found for your query. Please try rephrasing.")
except Exception as e:
log_message("error", f"An error occurred while processing your request: {e}")
st.info("Please try again or rephrase your question.") # User-friendly message
st.markdown("---")
# --- About Section ---
st.header("About This Project")
with st.expander("Learn More About the Medical Drug QA Chatbot"):
st.markdown(
"""
This project implements a **Retrieval-Based Question Answering (QA) system** designed to answer user queries
about medical drugs. It aims to provide accurate and factually grounded information by retrieving relevant
details from a verified database.
### Purpose
With the rapid increase in approved medications, ensuring factual accuracy in medical information is critical.
Traditional Large Language Models (LLMs) can sometimes "hallucinate" or provide untraceable answers.
Our system addresses this by grounding its responses in a curated database, ensuring factual consistency
and increasing user trust.
### Methodology
The system follows a multi-stage pipeline (an illustrative code sketch follows the list):
1. **Data Acquisition & Preprocessing:** Information about 2,755 drugs was web-scraped from MayoClinic.com,
cleaned, and flattened into a structured CSV dataset.
2. **Embedding Generation:** The dataset content is embedded using the **MiniLM-L6-v2** model and indexed
with **FAISS** (Facebook AI Similarity Search) for efficient similarity-based retrieval.
3. **Query Processing:** User queries undergo **intent and sub-intent classification** (e.g., identifying if
the user is asking about "side effects" or "dosage") and **Named Entity Recognition (NER)** using SciSpaCy
to improve retrieval precision.
4. **Retrieval Pipeline:**
* **Query Vectorization:** The user query is vectorized using MiniLM-L6-v2, incorporating weighted intent vectors.
* **Initial Retrieval:** FAISS is used to retrieve the top 10 most similar document chunks.
* **Reranking:** The retrieved chunks are then reranked using **Sentence-BioBERT**, which excels at
capturing biomedical contexts, significantly improving the relevance of the final selected documents.
5. **Answer Generation:** The top 3 reranked context chunks, along with the original query, are fed to the
**LLaMA-4 model** (via Groq API). The LLM is prompted to generate an answer *strictly based on the
provided context*, minimizing hallucination.
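
A simplified, illustrative sketch of this retrieve-rerank-generate flow is shown below. The
model identifiers and the tiny in-memory corpus are placeholders (assumptions for illustration);
the actual implementation lives in the project's `Scripts` modules.
```python
# Illustrative only: tiny in-memory corpus, assumed model IDs.
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer, util
from groq import Groq

embedder = SentenceTransformer("all-MiniLM-L6-v2")                          # retrieval encoder
reranker = SentenceTransformer("pritamdeka/S-BioBert-snli-multinli-stsb")   # biomedical reranker (assumed ID)

# Offline step: embed the flattened drug chunks and build a FAISS inner-product index.
chunks = ["Azithromycin dosage: ...", "Ibuprofen side effects: ..."]
chunk_vecs = embedder.encode(chunks, normalize_embeddings=True)
index = faiss.IndexFlatIP(chunk_vecs.shape[1])
index.add(np.asarray(chunk_vecs, dtype="float32"))

# Online step: retrieve the top 10 chunks with FAISS, rerank with Sentence-BioBERT, keep the top 3.
query = "What is the dosage for Azithromycin?"
q_vec = embedder.encode([query], normalize_embeddings=True)
_, ids = index.search(np.asarray(q_vec, dtype="float32"), 10)
candidates = [chunks[i] for i in ids[0] if i != -1]
scores = util.cos_sim(reranker.encode(query), reranker.encode(candidates))[0]
top3 = [c for _, c in sorted(zip(scores.tolist(), candidates), reverse=True)[:3]]

# Generation step: ask the LLM (via the Groq API) to answer strictly from the retrieved context.
client = Groq()  # reads GROQ_API_KEY from the environment
completion = client.chat.completions.create(
    model="meta-llama/llama-4-scout-17b-16e-instruct",  # assumed Groq model ID
    messages=[
        {"role": "system", "content": "Answer using only the provided context."},
        {"role": "user", "content": "Context: " + " | ".join(top3) + " Question: " + query},
    ],
)
print(completion.choices[0].message.content)
```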
### Models Used
* **MiniLM-L6-v2:** For FAISS-based vector retrieval.
* **Sentence-BioBERT:** For reranking candidate chunks.
* **LLaMA-4:** For final answer generation (accessed via Groq API).
* **SciSpaCy:** For Named Entity Recognition and intent classification.
This project was developed by Niranjan Sathish and Hariharan Chandrasekar.
"""
)
# --- Repository Link Button (Placeholder) ---
st.markdown("---")
st.write("### Project Resources")
st.markdown(
"""
Once the project is hosted, you'll find links to the repository or Hugging Face Space here.
"""
)
# Placeholder for the actual button. You can uncomment and update this later.
# if st.button("Go to GitHub Repository"):
# st.markdown("[GitHub Repository Link](YOUR_GITHUB_REPO_URL_HERE)")
# if st.button("Go to Hugging Face Space"):
# st.markdown("[Hugging Face Space Link](YOUR_HUGGING_FACE_SPACE_URL_HERE)")
# --- Application Logs Section ---
st.markdown("---")
st.header("Application Logs")
with st.expander("Show/Hide Logs"):
if app_messages:
for msg_type, msg_content in app_messages:
if msg_type == "info":
st.info(msg_content)
elif msg_type == "success":
st.success(msg_content)
elif msg_type == "warning":
st.warning(msg_content)
elif msg_type == "error":
st.error(msg_content)
else:
st.write("No application messages yet.")