#!/bin/bash
#
# Setup RAG system - One command to rule them all
#
# Usage: bash scripts/setup_rag.sh
#
# All paths are relative, so run this from the project root (the directory
# that contains data_mining/, rag/ and scripts/).
#
# Steps: clean up old artifacts, check python3/pip, install dependencies,
# create directories, build each vector store with its data_mining script,
# then run the status check.

set -euo pipefail

# ANSI colors, stored as backslash escapes and expanded by say() via printf %b.
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly RED='\033[0;31m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

#######################################
# Print a message, expanding backslash escapes (same as `echo -e`).
# Arguments: message words
# Outputs:   the message plus a newline on stdout
#######################################
say() {
  printf '%b\n' "$*"
}

#######################################
# Run a Python script with python3, retrying with `python` (when that
# command exists) if python3 exits non-zero.
# Arguments: script path and its arguments
# Returns:   0 if either interpreter succeeded, non-zero otherwise
#######################################
run_python() {
  if python3 "$@"; then
    return 0
  fi
  command -v python >/dev/null 2>&1 && python "$@"
}

#######################################
# Quietly install packages with pip3, falling back to pip.
# Arguments: arguments forwarded to `pip install -q`
# Returns:   non-zero if neither pip3 nor pip could install
#######################################
pip_install() {
  if command -v pip3 >/dev/null 2>&1 && pip3 install -q "$@"; then
    return 0
  fi
  pip install -q "$@"
}

#######################################
# Step 0: remove stale documents, temp folders, vector stores and Python
# caches. Every removal is best-effort: errors are silenced on purpose so a
# missing path or permission problem never aborts the setup.
# Outputs: progress messages on stdout
#######################################
cleanup_workspace() {
  say "${BLUE}🧹 Cleaning up old files and databases...${NC}"

  # Remove old PDF/MD files from data_mining (README.md is kept).
  # -print -quit stops at the first match; no pipe, so no SIGPIPE issues.
  if [[ -n "$(find data_mining -type f \
    \( -name '*.pdf' -o \( -name '*.md' ! -name 'README.md' \) \) \
    -print -quit 2>/dev/null)" ]]; then
    say "${YELLOW} Removing old PDF/MD files...${NC}"
    find data_mining -type f -name '*.pdf' -delete 2>/dev/null || true
    find data_mining -type f -name '*.md' ! -name 'README.md' -delete \
      2>/dev/null || true
    say "${GREEN} ✅ Old documents removed${NC}"
  fi

  # Clear temporary datasets and output folders.
  if [[ -d data_mining/datasets || -d data_mining/output ]]; then
    say "${YELLOW} Clearing temporary folders...${NC}"
    rm -rf -- data_mining/datasets 2>/dev/null || true
    rm -rf -- data_mining/output 2>/dev/null || true
    say "${GREEN} ✅ Temporary folders cleared${NC}"
  fi

  # Clear old vector stores (will be regenerated).
  # NOTE(review): because the stores are wiped here, the "already exists,
  # skipping" checks in setup_dataset will normally not trigger within the
  # same run - confirm whether that is intended.
  if [[ -d rag/vector_store ]]; then
    say "${YELLOW} Clearing old vector stores...${NC}"
    rm -rf -- rag/vector_store/* 2>/dev/null || true
    say "${GREEN} ✅ Old vector stores cleared${NC}"
  fi

  # Clear Python cache.
  if [[ -n "$(find . -type d -name '__pycache__' -print -quit \
    2>/dev/null)" ]]; then
    say "${YELLOW} Clearing Python cache...${NC}"
    # -prune keeps find from descending into directories it just removed.
    find . -type d -name '__pycache__' -prune -exec rm -rf -- {} + \
      2>/dev/null || true
    find . -type f -name '*.pyc' -delete 2>/dev/null || true
    say "${GREEN} ✅ Python cache cleared${NC}"
  fi

  say "${GREEN}✅ Cleanup complete!${NC}"
}

#######################################
# Build one dataset: run its mining script and move the generated Chroma
# store(s) into rag/vector_store. A failure is reported and the function
# still returns 0 so the overall setup continues with the next dataset.
# Arguments:
#   $1 - heading line
#   $2 - description line
#   $3 - message printed when every destination already exists
#   $4 - mining script path
#   $5 - success message
#   $6 - failure message
#   $7.. - alternating <generated dir> <destination dir> pairs
# Outputs: progress messages on stdout
# Returns: 0 always
#######################################
setup_dataset() {
  local heading=$1 description=$2 skip_msg=$3
  local script=$4 ok_msg=$5 fail_msg=$6
  shift 6
  local -a moves=("$@")
  local i
  local all_present=1
  local succeeded=1

  say "\n${BLUE}${heading}${NC}"
  say "${YELLOW}${description}${NC}"

  # Skip only when every destination store is already in place.
  for ((i = 1; i < ${#moves[@]}; i += 2)); do
    if [[ ! -d "${moves[i]}" ]]; then
      all_present=0
    fi
  done
  if ((all_present)); then
    say "${YELLOW}${skip_msg}${NC}"
    return 0
  fi

  # Scratch space used by the mining scripts.
  mkdir -p data_mining/datasets data_mining/output

  # Running inside `if` keeps `set -e` from aborting on failure, so the
  # "failed, continuing" path below is actually reachable.
  if run_python "${script}"; then
    mkdir -p rag/vector_store
    for ((i = 0; i < ${#moves[@]}; i += 2)); do
      # A destination that already exists is left alone; moving onto it
      # would nest the new store inside the old one.
      if [[ -d "${moves[i + 1]}" ]]; then
        continue
      fi
      mv -- "${moves[i]}" "${moves[i + 1]}" || succeeded=0
    done
  else
    succeeded=0
  fi

  if ((succeeded)); then
    say "${GREEN}${ok_msg}${NC}"
  else
    say "${YELLOW}${fail_msg}${NC}"
  fi

  # Always remove the scratch space, whether the build worked or not.
  rm -rf -- data_mining/datasets data_mining/output
}

#######################################
# Entry point: runs every setup step in order.
#######################################
main() {
  local python_version

  say "${BLUE}"
  echo "╔════════════════════════════════════════════════════════════╗"
  echo "║ 🏥 HeoCare RAG System Setup (HuggingFace) ║"
  echo "╚════════════════════════════════════════════════════════════╝"
  say "${NC}"

  # 0. Cleanup old files and databases
  cleanup_workspace

  # 1. Check Python
  say "${BLUE}🐍 Checking Python...${NC}"
  if ! command -v python3 >/dev/null 2>&1; then
    say "${RED}❌ Python3 not found!${NC}"
    echo "Please install Python 3.8 or higher"
    exit 1
  fi
  python_version=$(python3 --version)
  say "${GREEN}✅ ${python_version}${NC}"

  # 2. Check pip
  say "\n${BLUE}📦 Checking pip...${NC}"
  if ! command -v pip3 >/dev/null 2>&1 && ! command -v pip >/dev/null 2>&1; then
    say "${RED}❌ pip not found!${NC}"
    exit 1
  fi
  say "${GREEN}✅ pip found${NC}"

  # 3. Install dependencies (a failed install aborts the script via set -e)
  say "\n${BLUE}📦 Installing dependencies...${NC}"
  say "${YELLOW}This may take a few minutes...${NC}"
  if [[ -f requirements.txt ]]; then
    pip_install -r requirements.txt
    say "${GREEN}✅ Dependencies installed from requirements.txt${NC}"
  else
    say "${YELLOW}⚠️ requirements.txt not found, installing core packages...${NC}"
    pip_install langchain langchain-chroma langchain-huggingface chromadb \
      tqdm beautifulsoup4 requests
    say "${GREEN}✅ Core dependencies installed${NC}"
  fi

  # 4. Create directories
  say "\n${BLUE}📁 Creating directories...${NC}"
  mkdir -p rag/vector_store
  mkdir -p data_mining/datasets data_mining/output
  mkdir -p chroma_db
  say "${GREEN}✅ Directories created${NC}"

  # 5. Setup ViMedical Vietnamese Disease Dataset
  setup_dataset \
    "🏥 Setting up ViMedical Vietnamese Disease Dataset..." \
    "This will download and process 603 Vietnamese diseases..." \
    "⚠️ ViMedical database already exists, skipping..." \
    data_mining/mining_vimedical.py \
    "✅ ViMedical dataset ready (603 diseases)" \
    "⚠️ ViMedical setup failed, continuing..." \
    data_mining/output/medical_chroma rag/vector_store/medical_diseases

  # 6. Setup MentalChat16K Mental Health Dataset
  setup_dataset \
    "🧠 Setting up MentalChat16K Mental Health Dataset..." \
    "This will download and process 16K mental health conversations..." \
    "⚠️ Mental Health database already exists, skipping..." \
    data_mining/mining_mentalchat.py \
    "✅ Mental Health dataset ready (16K conversations)" \
    "⚠️ Mental Health setup failed, continuing..." \
    data_mining/output/mental_health_chroma rag/vector_store/mental_health

  # 7. Setup Nutrition Dataset (Dietary Profiles)
  setup_dataset \
    "🥗 Setting up Nutrition Dataset (Dietary Profiles)..." \
    "This will download 50 dietary profiles..." \
    "⚠️ Nutrition database already exists, skipping..." \
    data_mining/mining_nutrition.py \
    "✅ Nutrition profiles ready (50 profiles)" \
    "⚠️ Nutrition setup failed, continuing..." \
    data_mining/output/nutrition_chroma rag/vector_store/nutrition

  # 7b. Setup Vietnamese Food Nutrition Database
  setup_dataset \
    "🍜 Setting up Vietnamese Food Nutrition Database..." \
    "This will create 73 Vietnamese foods with nutrition facts..." \
    "⚠️ Vietnamese nutrition database already exists, skipping..." \
    data_mining/mining_vietnamese_nutrition.py \
    "✅ Vietnamese food nutrition ready (73 foods)" \
    "⚠️ Vietnamese nutrition setup failed, continuing..." \
    data_mining/output/vietnamese_nutrition_chroma \
    rag/vector_store/vietnamese_nutrition

  # 8. Setup Fitness Dataset
  setup_dataset \
    "💪 Setting up Fitness Dataset..." \
    "This will download and process gym exercises..." \
    "⚠️ Fitness database already exists, skipping..." \
    data_mining/mining_fitness.py \
    "✅ Fitness dataset ready" \
    "⚠️ Fitness setup failed, continuing..." \
    data_mining/output/fitness_chroma rag/vector_store/fitness

  # 9. Setup COVID-19 Dataset (DEPRECATED - Skipped)
  say "\n${BLUE}🦠 COVID-19 Dataset...${NC}"
  say "${YELLOW}⏭️ Skipping (dataset deprecated, already have Medical Q&A)${NC}"

  # 10. Setup Vietnamese Medical Q&A Dataset (produces two stores)
  setup_dataset \
    "💬 Setting up Vietnamese Medical Q&A Dataset..." \
    "This will download and process 9.3K medical Q&A pairs from HuggingFace..." \
    "⚠️ Medical Q&A databases already exist, skipping..." \
    data_mining/mining_medical_qa.py \
    "✅ Medical Q&A datasets ready (Symptom + General Health)" \
    "⚠️ Medical Q&A setup failed, continuing..." \
    data_mining/output/symptom_qa_chroma rag/vector_store/symptom_qa \
    data_mining/output/general_health_qa_chroma \
    rag/vector_store/general_health_qa

  # 11. Verify RAG (best-effort; checker stderr is intentionally hidden)
  say "\n${BLUE}✅ Verifying RAG system...${NC}"
  run_python scripts/check_rag_status.py 2>/dev/null \
    || echo "⚠️ Verification skipped"

  # 12. Generate Training Data (DISABLED - Not needed without fine-tuning)
  # echo -e "\n${BLUE}🤖 Generating synthetic training data...${NC}"
  # echo -e "${YELLOW}This will create ~200 conversations for fine-tuning...${NC}"
  #
  # if [ -d "fine_tuning/training_data" ] && [ "$(ls -A fine_tuning/training_data 2>/dev/null)" ]; then
  #   echo -e "${YELLOW}⚠️ Training data already exists, skipping generation...${NC}"
  # else
  #   python3 scripts/generate_training_data.py || python scripts/generate_training_data.py
  #   if [ $? -eq 0 ]; then
  #     echo -e "${GREEN}✅ Training data generated!${NC}"
  #   else
  #     echo -e "${YELLOW}⚠️ Training data generation failed, continuing...${NC}"
  #   fi
  # fi

  # 13. Fine-tune Models (DISABLED - Custom API doesn't support fine-tuning)
  # Fine-tuning requires OpenAI official API, which costs money and is not necessary
  # The app works well with base model + RAG without fine-tuning
  #
  # echo -e "\n${BLUE}🎓 Fine-tuning agents...${NC}"
  # echo -e "${YELLOW}This will fine-tune all agents with synthetic data (takes 30-60 min, costs ~\$2)${NC}"
  # echo -e "${YELLOW}Do you want to fine-tune now? (y/N)${NC}"
  # read -t 10 -n 1 -r FINETUNE_CHOICE || FINETUNE_CHOICE="n"
  # echo
  #
  # if [[ $FINETUNE_CHOICE =~ ^[Yy]$ ]]; then
  #   echo -e "${BLUE}🚀 Starting fine-tuning...${NC}"
  #   python3 scripts/auto_finetune.py || python scripts/auto_finetune.py
  #   if [ $? -eq 0 ]; then
  #     echo -e "${GREEN}✅ Fine-tuning complete!${NC}"
  #   else
  #     echo -e "${YELLOW}⚠️ Fine-tuning failed, check errors above${NC}"
  #   fi
  # else
  #   echo -e "${YELLOW}⏭️ Skipping fine-tuning (you can run it later with: python scripts/auto_finetune.py)${NC}"
  # fi
  say "\n${YELLOW}ℹ️ Training data generation and fine-tuning are disabled${NC}"
  say "${YELLOW} Reason: Custom API doesn't support fine-tuning (404 error)${NC}"
  say "${YELLOW} App works well with base model + RAG without fine-tuning${NC}"

  # Done
  say "\n${GREEN}"
  echo "╔════════════════════════════════════════════════════════════╗"
  echo "║ 🎉 Setup Complete! ║"
  echo "╚════════════════════════════════════════════════════════════╝"
  say "${NC}"
  say "${BLUE}📊 What was set up:${NC}"
  echo " ✅ RAG databases (6 specialized databases, ~160 MB)"
  echo " - ViMedical Diseases (603 diseases)"
  echo " - Mental Health (16K conversations)"
  echo " - Nutrition Plans"
  echo " - Vietnamese Food (73 items)"
  echo " - Fitness Exercises (1.66K)"
  echo " - Medical Q&A (9.3K pairs)"
  echo ""
  say "${BLUE}🚀 Next steps:${NC}"
  echo " 1. python app.py"
  echo " 2. Open http://localhost:7860 in your browser"
  echo ""
  say "${BLUE}💡 Tips:${NC}"
  echo " - Check RAG status: python scripts/check_rag_status.py"
  echo " - App works with base model + RAG (no fine-tuning needed)"
  echo ""
}

main "$@"