import dotenv
# Load environment variables from .env file
dotenv.load_dotenv()
import streamlit as st
import os
import sys
# --- Page configuration (must be the first Streamlit call in the script) ---
st.set_page_config(layout="wide", page_title="DrugBot", page_icon="💊")
# --- Custom CSS for reduced whitespace and colors ---
st.markdown(
"""
<style>
/* Reduce top padding for the main Streamlit app container */
.stApp {
padding-top: 0px; /* Reduced this value to minimize whitespace at the very top */
padding-bottom: 20px;
}
/* Set a subtle background color for the entire page */
body {
background-color: #f0f8ff; /* AliceBlue - a very light blue */
color: #333333; /* Dark gray for text */
}
/* Style for headers */
h1, h2, h3, h4, h5, h6 {
color: #1a5276; /* Darker blue for headings */
}
/* Style for buttons */
.stButton>button {
background-color: #28a745; /* Green for primary button */
color: white;
border-radius: 8px;
padding: 10px 20px;
border: none;
box-shadow: 2px 2px 5px rgba(0,0,0,0.2);
transition: background-color 0.3s ease;
}
.stButton>button:hover {
background-color: #218838; /* Darker green on hover */
}
/* Style for text areas and select boxes */
.stTextArea textarea, .stSelectbox [data-testid="stSelectbox"] {
border-radius: 8px;
border: 1px solid #cccccc;
}
/* Style for info, success, warning, error boxes */
.stAlert {
border-radius: 8px;
}
</style>
""",
unsafe_allow_html=True
)
# --- Global message log ---
# This list will store messages to be displayed in the log expander
app_messages = []
def log_message(msg_type, message):
"""
Append a message to the in-app log. Errors are also surfaced immediately via st.error;
other message types appear only in the "Application Logs" expander at the bottom of the page.
"""
app_messages.append((msg_type, message))
if msg_type == "error":
st.error(message)
# Add the 'Scripts' directory to the Python path
# This allows importing modules like Query_processing, Retrieval, and Answer_Generation
script_dir = os.path.join(os.path.dirname(__file__), 'Scripts')
log_message("info", f"Attempting to add '{script_dir}' to Python path.")
if script_dir not in sys.path:
sys.path.append(script_dir)
log_message("info", f"'{script_dir}' added to sys.path.")
else:
log_message("info", f"'{script_dir}' already in sys.path.")
# --- Debugging: Check if script files exist ---
script_files_to_check = {
"Query_processing.py": False,
"Retrieval.py": False,
"Answer_Generation.py": False
}
all_scripts_found = True
for script_name in script_files_to_check:
script_path = os.path.join(script_dir, script_name)
if os.path.exists(script_path):
script_files_to_check[script_name] = True
else:
all_scripts_found = False
log_message("error", f"Error: Script file not found at expected path: {script_path}")
if not all_scripts_found:
log_message("error", "One or more essential script files are missing from the 'Scripts' directory. "
"Please ensure your project structure is correct.")
st.stop() # Stop execution if critical files are missing
# Import your core logic modules
try:
from Query_processing import preprocess_query
from Retrieval import Retrieval_averagedQP
from Answer_Generation import answer_generation
log_message("success", "Core modules imported successfully!")
except ImportError as e:
log_message("error", f"Error importing core modules. Make sure 'Scripts' directory is correctly structured and contains "
f"Query_processing.py, Retrieval.py, and Answer_Generation.py. Error: {e}")
st.stop()
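# Interfaces assumed by this app, inferred from how the functions are called further down;
# the modules in Scripts/ are the authoritative definitions:
#   preprocess_query(query)                       -> ((intent, sub_intent), entities)
#   Retrieval_averagedQP(query, intent, entities) -> pandas.DataFrame of ranked chunks
#   answer_generation(query, chunks)              -> str (the generated answer)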
# --- Configuration ---
# Define paths to your data and vectors
# These paths are relative to the app.py location
DATASET_PATH = os.path.join(os.path.dirname(__file__), 'Datasets', 'flattened_drug_dataset_cleaned.csv')
VECTORS_DIR = os.path.join(os.path.dirname(__file__), 'Vectors')
FAISS_INDEX_PATH = os.path.join(VECTORS_DIR, 'faiss_index.idx')
DOC_METADATA_PATH = os.path.join(VECTORS_DIR, 'doc_metadata.pkl')
DOC_VECTORS_PATH = os.path.join(VECTORS_DIR, 'doc_vectors.npy')
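# Expected on-disk layout, inferred from the paths above and the Scripts check earlier:
#   app.py
#   Scripts/   Query_processing.py, Retrieval.py, Answer_Generation.py
#   Datasets/  flattened_drug_dataset_cleaned.csv
#   Vectors/   faiss_index.idx, doc_metadata.pkl, doc_vectors.npy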
# --- Cached Resources ---
# Use st.cache_resource to load heavy models and data only once
@st.cache_resource
def load_all_assets():
"""
Verifies the existence of necessary files and attempts to load core NLP models.
This function will be run only once across all user sessions.
"""
with st.spinner("Verifying medical knowledge base and models... This might take a moment."):
try:
# 1. Check for presence of FAISS and embedding files
if not os.path.exists(FAISS_INDEX_PATH):
log_message("error", f"Missing FAISS index file: {FAISS_INDEX_PATH}")
return False
if not os.path.exists(DOC_METADATA_PATH):
log_message("error", f"Missing document metadata file: {DOC_METADATA_PATH}")
return False
if not os.path.exists(DOC_VECTORS_PATH):
log_message("error", f"Missing document vectors file: {DOC_VECTORS_PATH}")
return False
# 2. SciSpaCy model check.
# Query_processing is expected to load the 'en_core_sci_md' model itself; loading it here
# as well would keep a second copy of a large model in memory, so we only record that
# expectation. If the model is missing, Query_processing will raise an OSError when it
# tries to load it; install it with, e.g.,
# `pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz`.
log_message("info", "SciSpaCy 'en_core_sci_md' model is expected to be loaded by Query_processing.")
log_message("success", "Medical knowledge base files verified. Models will be loaded as needed.")
return True # Indicate successful verification
except Exception as e:
log_message("error", f"Failed to verify assets. Please ensure all data and vector files are in their correct paths. Error: {e}")
return False
# Load all assets at the start of the application
assets_loaded = load_all_assets()
# --- Title and Header ---
st.title("💊 DrugBot")
st.markdown("---")
# --- Instructions ---
st.header("How to Use:")
st.write(
"""
Welcome to DrugBot, a retrieval-based medical drug QA chatbot! You can ask questions about medical drugs, and I will retrieve
information from a verified database to provide accurate, grounded answers.
1. **Select an example query** from the dropdown or **type your own question** in the text area below.
2. Click the **"Get Answer"** button.
3. Wait for the chatbot to process your query and generate an answer.
"""
)
st.markdown("---")
# --- Example Queries ---
st.header("Try These Examples:")
example_queries = [
"Select an example query...",
"What is the dosage for Azithromycin?",
"What are the side effects of Ibuprofen?",
"How should I take Amoxicillin?",
"What are the precautions for Warfarin?",
"What are the drug interactions for Metformin?",
"What is Paracetamol used for?",
"Can pregnant women take Aspirin?",
"How does Prednisone work?",
"What is the recommended dose for children for Tylenol?"
]
selected_example = st.selectbox(
"Choose a pre-defined question:",
example_queries
)
user_query = st.text_area(
"Or type your question here:",
value="" if selected_example == "Select an example query..." else selected_example,
height=100,
placeholder="e.g., What is the dosage for Azithromycin?"
)
# --- Chatbot Interaction ---
if st.button("Get Answer", type="primary"):
if not assets_loaded:
log_message("error", "Application assets failed to verify. Please check the console for errors.")
elif not user_query.strip():
log_message("warning", "Please enter a question or select an example query.")
else:
# Check for Groq API Key
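# The key can be supplied via a local .env file (loaded by python-dotenv at the top of
# this file) or, when running as a Hugging Face Space, as a Space secret named GROQ_API_KEY.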
if "GROQ_API_KEY" not in os.environ:
log_message("error", "GROQ_API_KEY environment variable not set. Please set it to use the chatbot.")
else:
with st.spinner("Thinking... Retrieving and generating answer..."):
try:
# 1. Preprocess Query
# Query_processing.py should handle its own spacy model loading.
(intent, sub_intent), entities = preprocess_query(user_query)
log_message("info", f"Detected Intent: {intent}, Sub-Intent: {sub_intent}, Entities: {entities}")
# 2. Retrieve Chunks
# Retrieval_averagedQP is expected to load FAISS index and vectors internally.
chunks = Retrieval_averagedQP(user_query, intent, entities)
if not chunks.empty: # Check if chunks DataFrame is not empty
# 3. Generate Answer
answer = answer_generation(user_query, chunks)
log_message("info", f"Generated Answer Content: {answer[:200]}...") # Log first 200 chars
if not answer.strip(): # Check if answer is empty after stripping whitespace
log_message("warning", "Answer generation returned an empty response.")
st.warning("Could not generate a clear answer for this query. Please try rephrasing.")
else:
log_message("success", "Answer generated successfully!")
st.success("Answer:") # Display success message
st.write(answer) # This prints the answer in the main area
with st.expander("See Retrieved Chunks (for debugging/transparency)"):
st.write("Top 3 Retrieved Chunks:")
for i, chunk in enumerate(chunks.head(3).to_dict(orient='records')): # Display top 3 for brevity
st.write(f"**Chunk {i+1}:**")
st.json(chunk) # Use st.json for better display of dict
st.markdown("---")
else:
log_message("warning", "No relevant information found for your query. Please try rephrasing.")
except Exception as e:
log_message("error", f"An error occurred while processing your request: {e}")
st.info("Please try again or rephrase your question.") # User-friendly message
st.markdown("---")
# --- About Section ---
st.header("About This Project")
with st.expander("Learn More About the Medical Drug QA Chatbot"):
st.markdown(
"""
This project implements a **Retrieval-Based Question Answering (QA) system** designed to answer user queries
about medical drugs. It aims to provide accurate and factually grounded information by retrieving relevant
details from a verified database.
### Purpose
With the rapid increase in approved medications, ensuring factual accuracy in medical information is critical.
Traditional Large Language Models (LLMs) can sometimes "hallucinate" or provide untraceable answers.
Our system addresses this by grounding its responses in a curated database, ensuring factual consistency
and increasing user trust.
### Methodology
The system follows a multi-stage pipeline (an illustrative code sketch follows the list):
1. **Data Acquisition & Preprocessing:** Information about 2,755 drugs was web-scraped from MayoClinic.com,
cleaned, and flattened into a structured CSV dataset.
2. **Embedding Generation:** The dataset content is embedded using the **MiniLM-L6-v2** model and indexed
with **FAISS** (Facebook AI Similarity Search) for efficient similarity-based retrieval.
3. **Query Processing:** User queries undergo **intent and sub-intent classification** (e.g., identifying if
the user is asking about "side effects" or "dosage") and **Named Entity Recognition (NER)** using SciSpaCy
to improve retrieval precision.
4. **Retrieval Pipeline:**
* **Query Vectorization:** The user query is vectorized using MiniLM-L6-v2, incorporating weighted intent vectors.
* **Initial Retrieval:** FAISS is used to retrieve the top 10 most similar document chunks.
* **Reranking:** The retrieved chunks are then reranked using **Sentence-BioBERT**, which excels at
capturing biomedical contexts, significantly improving the relevance of the final selected documents.
5. **Answer Generation:** The top 3 reranked context chunks, along with the original query, are fed to the
**LLaMA-4 model** (via Groq API). The LLM is prompted to generate an answer *strictly based on the
provided context*, minimizing hallucination.
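
A simplified, illustrative sketch of this retrieve-rerank-generate flow is shown below. The
model identifiers and the tiny in-memory corpus are placeholders (assumptions for illustration);
the actual implementation lives in the project's `Scripts` modules.
```python
# Illustrative only: tiny in-memory corpus, assumed model IDs.
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer, util
from groq import Groq

embedder = SentenceTransformer("all-MiniLM-L6-v2")                          # retrieval encoder
reranker = SentenceTransformer("pritamdeka/S-BioBert-snli-multinli-stsb")   # biomedical reranker (assumed ID)

# Offline step: embed the flattened drug chunks and build a FAISS inner-product index.
chunks = ["Azithromycin dosage: ...", "Ibuprofen side effects: ..."]
chunk_vecs = embedder.encode(chunks, normalize_embeddings=True)
index = faiss.IndexFlatIP(chunk_vecs.shape[1])
index.add(np.asarray(chunk_vecs, dtype="float32"))

# Online step: retrieve the top 10 chunks with FAISS, rerank with Sentence-BioBERT, keep the top 3.
query = "What is the dosage for Azithromycin?"
q_vec = embedder.encode([query], normalize_embeddings=True)
_, ids = index.search(np.asarray(q_vec, dtype="float32"), 10)
candidates = [chunks[i] for i in ids[0] if i != -1]
scores = util.cos_sim(reranker.encode(query), reranker.encode(candidates))[0]
top3 = [c for _, c in sorted(zip(scores.tolist(), candidates), reverse=True)[:3]]

# Generation step: ask the LLM (via the Groq API) to answer strictly from the retrieved context.
client = Groq()  # reads GROQ_API_KEY from the environment
completion = client.chat.completions.create(
    model="meta-llama/llama-4-scout-17b-16e-instruct",  # assumed Groq model ID
    messages=[
        {"role": "system", "content": "Answer using only the provided context."},
        {"role": "user", "content": "Context: " + " | ".join(top3) + " Question: " + query},
    ],
)
print(completion.choices[0].message.content)
```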
### Models Used
* **MiniLM-L6-v2:** For FAISS-based vector retrieval.
* **Sentence-BioBERT:** For reranking candidate chunks.
* **LLaMA-4:** For final answer generation (accessed via Groq API).
* **SciSpaCy:** For Named Entity Recognition and intent classification.
This project was developed by Niranjan Sathish and Hariharan Chandrasekar.
"""
)
# --- Repository Link Button (Placeholder) ---
st.markdown("---")
st.write("### Project Resources")
st.markdown(
"""
Once the project is hosted, you'll find links to the repository or Hugging Face Space here.
"""
)
# Placeholder for the actual button. You can uncomment and update this later.
# if st.button("Go to GitHub Repository"):
# st.markdown("[GitHub Repository Link](YOUR_GITHUB_REPO_URL_HERE)")
# if st.button("Go to Hugging Face Space"):
# st.markdown("[Hugging Face Space Link](YOUR_HUGGING_FACE_SPACE_URL_HERE)")
# --- Application Logs Section ---
st.markdown("---")
st.header("Application Logs")
with st.expander("Show/Hide Logs"):
if app_messages:
for msg_type, msg_content in app_messages:
if msg_type == "info":
st.info(msg_content)
elif msg_type == "success":
st.success(msg_content)
elif msg_type == "warning":
st.warning(msg_content)
elif msg_type == "error":
st.error(msg_content)
else:
st.write("No application messages yet.")