Spaces:

A7m0d
/

Autism_QA

Runtime error

File size: 48,231 Bytes

712579e

import asyncio
import base64
import os
import time
from google.genai import types
from google.genai.types import (
    LiveConnectConfig,
    SpeechConfig,
    VoiceConfig,
    PrebuiltVoiceConfig,
    Content,
    Part,)
    
import os 
from pipeQuery import process_query
import re
from pipeQuery import clean_pipeline_result
import numpy as np
from dotenv import load_dotenv
from fastrtc import wait_for_item
import google.genai as genai
import asyncio
import base64
import os
from typing import AsyncGenerator, Literal
import gradio as gr
import numpy as np
from fastrtc import (
    AsyncStreamHandler,
    wait_for_item,)

import google.generativeai as genai
from google.genai.types import (
    LiveConnectConfig,
    PrebuiltVoiceConfig,
    SpeechConfig,
    VoiceConfig,)

from clients import gemini_client
import soundfile as sf
import io
import collections
import time


# Module-level logger, obtained from the project's CustomLoggerTracker under
# the name "audio_utils"; every function below logs through it.
from logger.custom_logger import CustomLoggerTracker
custom_log = CustomLoggerTracker()
logger = custom_log.get_logger("audio_utils")
# Load environment variables from a .env file (transcribe_audio reads
# GEMINI_API_KEY from the environment).
load_dotenv()

# Load the YAML configuration; the code below reads its "audio" section for
# model names and VAD settings.
from configs import load_yaml_config
config = load_yaml_config("config.yaml")


def encode_audio(data: np.ndarray) -> dict:
    """Package an array's raw bytes as a base64-encoded PCM payload.

    Args:
        data: Array whose in-memory bytes (``data.tobytes()``) are encoded.

    Returns:
        A dict with ``"mime_type"`` set to ``"audio/pcm"`` and ``"data"`` set
        to the base64 text of the array's bytes.
    """
    raw_samples = data.tobytes()
    encoded_text = base64.b64encode(raw_samples).decode("UTF-8")
    payload = {"mime_type": "audio/pcm", "data": encoded_text}
    return payload


def encode_audio2(data: np.ndarray) -> bytes:
    """Return the array's raw in-memory bytes, without any wrapping or encoding."""
    raw_samples = data.tobytes()
    return raw_samples


def numpy_array_to_wav_bytes(audio_array, sample_rate=16000):
    """Serialize an audio array into an in-memory WAV file.

    This function used to be defined twice in a row with equivalent bodies;
    the second definition silently replaced the first. A single definition is
    kept.

    Args:
        audio_array: Samples passed to ``soundfile.write``.
        sample_rate: Sample rate written into the WAV header (default 16000).

    Returns:
        The complete WAV file contents as ``bytes``.
    """
    with io.BytesIO() as buffer:
        sf.write(buffer, audio_array, sample_rate, format='WAV')
        # getvalue() returns the whole buffer regardless of the current
        # stream position, so no seek(0) is needed.
        return buffer.getvalue()


class GeminiHandler(AsyncStreamHandler):
    """Stream handler that turns user speech into a spoken assistant answer.

    Data flow, as implemented by the methods below:

    * ``receive`` buffers incoming audio frames, segments utterances (with
      webrtcvad when it is installed), transcribes each utterance with
      ``s2t`` and puts the transcript on ``input_queue``.
    * ``start_up`` consumes transcripts via ``stream``, runs them through
      ``t2t_with_rag`` and sends the result to a Gemini live session; audio
      chunks from the session are put on ``output_queue``.
    * ``emit`` hands queued audio chunks back to the caller.
    """

    def __init__(
        self,
        expected_layout: Literal["mono"] = "mono",
        output_sample_rate: int = 24000,
        prompt_dict: dict | None = None,
    ) -> None:
        """Create the handler's queues, model settings and VAD state.

        Args:
            expected_layout: Channel layout forwarded to the base class.
            output_sample_rate: Sample rate attached to emitted audio chunks.
            prompt_dict: Prompt settings stored on the instance. Defaults to
                ``{"prompt": "PHQ-9"}``; a fresh dict is created per instance
                when omitted, so instances no longer share one mutable
                default object.
        """
        super().__init__(
            expected_layout,
            output_sample_rate,
            input_sample_rate=16000,
        )
        self.input_queue: asyncio.Queue = asyncio.Queue()
        self.output_queue: asyncio.Queue = asyncio.Queue()
        self.quit: asyncio.Event = asyncio.Event()
        self.is_active: bool = False
        self.prompt_dict = {"prompt": "PHQ-9"} if prompt_dict is None else prompt_dict

        # Load from config if available, otherwise use defaults.
        # TypeError is included so that a config that loaded as None (or any
        # non-mapping) also falls back instead of crashing the constructor.
        try:
            self.model = config["audio"]["model_live"]
            # NOTE(review): the text-to-text model is read from "tts_model";
            # confirm this key is the intended one.
            self.t2t_model = config["audio"]["tts_model"]
            self.s2t_model = config["audio"]["stt_model"]
            self.VAD_RATE = config["audio"]["VAD_RATE"]
            self.VAD_FRAME_MS = config["audio"]["VAD_FRAME_MS"]
            padding_ms = config["audio"]["padding_ms"]
            self.vad_ratio = config["audio"]["vad_ratio"]
        except (KeyError, NameError, TypeError):
            # Fallback defaults if config not available
            self.model = "gemini-2.5-flash-preview-tts"
            self.t2t_model = "gemini-2.0-flash-exp"
            self.s2t_model = "gemini-2.0-flash-exp"
            self.VAD_RATE = 16000
            self.VAD_FRAME_MS = 30
            padding_ms = 300
            self.vad_ratio = 0.9

        # VAD initialization; webrtcvad is optional.
        try:
            import webrtcvad
            self.vad = webrtcvad.Vad(3)
            self.vad_available = True
        except ImportError:
            logger.warning("webrtcvad not available, VAD disabled")
            # Always define the attribute so later reads cannot raise
            # AttributeError when the package is missing.
            self.vad = None
            self.vad_available = False

        self.VAD_FRAME_SAMPLES = int(self.VAD_RATE * (self.VAD_FRAME_MS / 1000.0))
        # Frames are sliced assuming 2 bytes per sample (int16).
        self.VAD_FRAME_BYTES = self.VAD_FRAME_SAMPLES * 2
        self.vad_padding_frames = padding_ms // self.VAD_FRAME_MS
        self.vad_ring_buffer = collections.deque(maxlen=self.vad_padding_frames)
        self.vad_triggered = False
        self.wav_data = bytearray()
        self.internal_buffer = bytearray()
        self.end_of_speech_time: float | None = None
        self.first_latency_calculated: bool = False

    def copy(self) -> "GeminiHandler":
        """Return a new handler with the same output rate and prompt settings."""
        return GeminiHandler(
            expected_layout="mono",
            output_sample_rate=self.output_sample_rate,
            prompt_dict=self.prompt_dict,
        )

    def stop(self) -> None:
        """Signal all loops to exit and mark the handler inactive."""
        logger.info("Stopping GeminiHandler...")
        self.quit.set()
        self.is_active = False

    def shutdown(self) -> None:
        """Alias for ``stop``."""
        self.stop()

    def t2t_with_rag(self, text: str) -> str:
        """Answer ``text`` through the RAG pipeline, falling back to plain chat.

        ``process_query`` is tried first. If it raises, the message is sent
        to ``self.chat`` instead; if that fails too, a fixed apology string
        is returned.

        Args:
            text: The user's transcribed utterance.

        Returns:
            The cleaned pipeline answer, the chat fallback answer, or the
            apology string.
        """
        try:
            response = process_query(text)
            if isinstance(response, tuple):
                # Use the first tuple element unless it is empty.
                result = clean_pipeline_result(response[0] if response[0] else response[1])
            else:
                result = clean_pipeline_result(str(response))
            logger.info(f"RAG response generated: {result[:100]}...")
            return result
        except Exception as e:
            logger.error(f"Error in RAG processing: {e}")
            try:
                response = self.chat.send_message(text)
                return response.text
            except Exception as fallback_error:
                logger.error(f"Fallback Gemini also failed: {fallback_error}")
                return "I'm sorry, I'm having trouble processing your request right now."

    def s2t(self, audio) -> str:
        """Transcribe WAV bytes with the speech-to-text model.

        Args:
            audio: WAV file contents, sent with mime type ``audio/wav``.

        Returns:
            The stripped transcript, or ``""`` when the model returns no text
            or the request fails.
        """
        try:
            response = self.s2t_client.models.generate_content(
                model=self.s2t_model,
                contents=[
                    types.Part.from_bytes(data=audio, mime_type='audio/wav'),
                    'Generate a transcript of the speech.'
                ]
            )
            # response.text may be None; treat that as an empty transcript
            # rather than relying on the broad except below.
            return (response.text or "").strip()
        except Exception as e:
            logger.error(f"STT error: {e}")
            return ""

    async def start_up(self):
        """Initialize the handler and run the main processing loop.

        Creates the Gemini clients and the fallback chat, opens a live
        session, then for every transcript yielded by ``stream`` sends the
        (optionally RAG-processed) text to the session and queues the
        returned audio chunks on ``output_queue``. Every failure is logged
        and ends the method; ``is_active`` is reset on exit.
        """
        try:
            self.is_active = True
            self.t2t_bool = True  # Enable RAG processing

            # Initialize clients with error handling
            try:
                self.t2t_client = gemini_client()
                self.s2t_client = gemini_client()
                self.t2s_client = gemini_client()
            except Exception as e:
                logger.error(f"Failed to initialize Gemini clients: {e}")
                return

            # Chat configuration
            sys_instruction = """You are Wisal, an AI assistant developed by Compumacy AI, specialized in Autism Spectrum Disorder (ASD).

                               Your sole purpose is to provide helpful, respectful, and easy-to-understand answers about Autism.

                               Always be clear, non-judgmental, and supportive."""

            try:
                chat_config = types.GenerateContentConfig(system_instruction=sys_instruction)
                self.chat = self.t2t_client.chats.create(model=self.t2t_model, config=chat_config)
            except Exception as e:
                logger.error(f"Failed to create chat: {e}")
                return

            # Live connect configuration. Named live_config so it does not
            # shadow the module-level YAML ``config``.
            voice_name = "Puck"
            try:
                live_config = LiveConnectConfig(
                    response_modalities=["AUDIO"],
                    speech_config=SpeechConfig(
                        voice_config=VoiceConfig(
                            prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=voice_name)
                        )
                    ),
                    system_instruction=Content(parts=[Part.from_text(text=sys_instruction)])
                )
            except Exception as e:
                logger.error(f"Failed to create live config: {e}")
                return

            # Main processing loop with stop capability
            try:
                async with self.t2s_client.aio.live.connect(model=self.model, config=live_config) as session:
                    async for text_from_user in self.stream():
                        if self.quit.is_set():
                            break

                        if not (text_from_user and text_from_user.strip()):
                            continue

                        logger.info(f"Processing user input: {text_from_user}")

                        # Process through RAG pipeline. t2t_with_rag is
                        # synchronous, so it runs in a worker thread to keep
                        # the event loop free while it works.
                        if self.t2t_bool:
                            processed_response = await asyncio.to_thread(
                                self.t2t_with_rag, text_from_user
                            )
                        else:
                            processed_response = text_from_user

                        try:
                            await session.send_client_content(
                                turns=types.Content(
                                    role='user',
                                    parts=[types.Part(text=processed_response)]
                                )
                            )

                            async for resp_chunk in session.receive():
                                if self.quit.is_set():
                                    break

                                if resp_chunk.data:
                                    array = np.frombuffer(resp_chunk.data, dtype=np.int16)
                                    self.output_queue.put_nowait((self.output_sample_rate, array))

                        except Exception as e:
                            logger.error(f"Error in session communication: {e}")

            except Exception as e:
                logger.error(f"Error in live session: {e}")

        except Exception as e:
            logger.error(f"Error in start_up: {e}")
        finally:
            self.is_active = False

    async def stream(self) -> AsyncGenerator[str, None]:
        """Yield transcripts from ``input_queue`` until ``quit`` is set.

        The queue is polled with a one-second timeout so the quit flag is
        re-checked regularly even when no input arrives.
        """
        while not self.quit.is_set():
            try:
                text_to_speak = await asyncio.wait_for(self.input_queue.get(), timeout=1.0)
                if text_to_speak and not self.quit.is_set():
                    yield text_to_speak
            except asyncio.TimeoutError:
                continue
            except Exception as e:
                logger.error(f"Error in stream: {e}")
                break

    async def _transcribe_and_enqueue(self, pcm, sample_rate: int) -> None:
        """Transcribe int16 PCM bytes and queue the transcript if non-empty."""
        samples = np.frombuffer(pcm, dtype=np.int16)
        wav_bytes = numpy_array_to_wav_bytes(samples, sample_rate)
        # s2t makes a synchronous request; run it off the event loop.
        transcript = await asyncio.to_thread(self.s2t, wav_bytes)
        if transcript and transcript.strip():
            self.input_queue.put_nowait(transcript)

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        """Buffer an audio frame and transcribe completed utterances.

        Args:
            frame: ``(sample_rate, samples)``; the samples' raw bytes are
                appended to the internal buffer and later read as int16.

        Without webrtcvad the buffer is transcribed whenever it exceeds ten
        VAD frames. With webrtcvad, recording starts once more than
        ``vad_ratio`` of the ring buffer is voiced and ends once more than
        ``vad_ratio`` of it is unvoiced; the recorded utterance is then
        transcribed and queued.
        """
        if self.quit.is_set():
            return

        try:
            sr, array = frame
            self.internal_buffer.extend(array.tobytes())

            # Simple fallback without VAD
            if not self.vad_available:
                if len(self.internal_buffer) > self.VAD_FRAME_BYTES * 10:  # Collect some audio
                    # Detach the buffer before awaiting so it is reset even
                    # when transcription fails.
                    pending = self.internal_buffer
                    self.internal_buffer = bytearray()
                    await self._transcribe_and_enqueue(pending, sr)
                return

            frame_bytes = self.VAD_FRAME_BYTES
            while len(self.internal_buffer) >= frame_bytes:
                if self.quit.is_set():
                    break

                vad_frame = bytes(self.internal_buffer[:frame_bytes])
                # Drop the consumed frame in place instead of re-slicing
                # (and copying) the remainder on every iteration.
                del self.internal_buffer[:frame_bytes]

                try:
                    is_speech = self.vad.is_speech(vad_frame, self.VAD_RATE)
                except Exception as e:
                    logger.error(f"VAD error: {e}")
                    continue

                if not self.vad_triggered:
                    self.vad_ring_buffer.append((vad_frame, is_speech))
                    num_voiced = sum(1 for _, speech in self.vad_ring_buffer if speech)
                    if num_voiced > self.vad_ratio * self.vad_ring_buffer.maxlen:
                        logger.info("Speech detected, starting to record...")
                        self.vad_triggered = True
                        # Keep the padding frames that preceded the trigger.
                        for buffered_frame, _ in self.vad_ring_buffer:
                            self.wav_data.extend(buffered_frame)
                        self.vad_ring_buffer.clear()
                else:
                    self.wav_data.extend(vad_frame)
                    self.vad_ring_buffer.append((vad_frame, is_speech))
                    num_unvoiced = sum(1 for _, speech in self.vad_ring_buffer if not speech)
                    if num_unvoiced > self.vad_ratio * self.vad_ring_buffer.maxlen:
                        logger.info("End of speech detected.")
                        self.vad_triggered = False

                        # Reset the recording state before awaiting.
                        utterance = self.wav_data
                        self.wav_data = bytearray()
                        self.vad_ring_buffer.clear()

                        try:
                            await self._transcribe_and_enqueue(utterance, sr)
                        except Exception as e:
                            logger.error(f"Error processing speech: {e}")

        except Exception as e:
            logger.error(f"Error in receive: {e}")

    async def emit(self) -> tuple[int, np.ndarray] | None:
        """Return the next queued audio chunk, or ``None`` on timeout or error."""
        try:
            return await asyncio.wait_for(wait_for_item(self.output_queue), timeout=1.0)
        except asyncio.TimeoutError:
            return None
        except Exception as e:
            logger.error(f"Error in emit: {e}")
            return None

# Global handle
# ---------------------------
# Audio Transcription
# ---------------------------

def transcribe_audio(audio_filepath):
    """Transcribe an audio file with Gemini, yielding progress then the result.

    This is a generator. It yields zero or more ``"Status: ..."`` progress
    strings and finishes with exactly one final item: either the transcript
    or a string starting with ``"[ERROR]"``.

    The uploaded file is deleted in a ``finally`` block, so it is also removed
    when processing fails, when ``generate_content`` raises, or when the
    caller stops iterating early.

    Args:
        audio_filepath: Path of the audio file to upload and transcribe.

    Yields:
        Progress messages, then the transcript or an ``"[ERROR] ..."`` message.
    """
    logger.info(f"Starting audio transcription for: {audio_filepath}")
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        logger.error("GEMINI_API_KEY environment variable not set.")
        yield "[ERROR] API Key is missing. Please configure your environment."
        return

    if not audio_filepath or not os.path.exists(audio_filepath):
        logger.error(f"Audio file does not exist at path: {audio_filepath}")
        yield "[ERROR] Audio file not found. Please record or upload again."
        return

    genai.configure(api_key=api_key)
    # NOTE(review): transcription uses the "tts_model" config entry; confirm
    # that this is intended rather than "stt_model".
    model = genai.GenerativeModel(model_name=config["audio"]["tts_model"])

    logger.info(f"Uploading audio file for transcription: {audio_filepath}")
    yield "Status: Uploading audio..."
    audio_file = genai.upload_file(path=audio_filepath)

    try:
        # Poll until the server has finished processing the upload.
        while audio_file.state.name == "PROCESSING":
            yield "Status: Processing uploaded file..."
            time.sleep(2)
            audio_file = genai.get_file(audio_file.name)

        if audio_file.state.name == "FAILED":
            logger.error("Google AI file processing failed.")
            yield "[ERROR] Audio file processing failed on the server."
            return

        yield "Status: Transcribing..."
        response = model.generate_content(
            ["Please transcribe this audio recording accurately.", audio_file],
            request_options={"timeout": 120})
    finally:
        # Best-effort cleanup: a failed delete must not hide the real outcome.
        try:
            genai.delete_file(audio_file.name)
        except Exception as cleanup_error:
            logger.warning(f"Could not delete uploaded audio file: {cleanup_error}")

    # The .text accessor can raise when the response carries no usable text;
    # hasattr() only swallows AttributeError, so guard the access explicitly.
    try:
        transcript = response.text if response else None
    except (AttributeError, ValueError):
        transcript = None

    if transcript:
        query = transcript.strip()
        logger.info(f"Transcription complete, length={len(query)}")
        yield query
    else:
        logger.error("Transcription failed: empty/malformed response.")
        yield "[ERROR] Transcription failed: The model returned an empty response."



def get_transcription_or_text(text_input, audio_input):
    """Extract the user's query from typed text or, failing that, from audio.

    Typed text wins when it is non-blank. Otherwise the audio is transcribed
    with ``transcribe_audio``, which yields progress messages before its
    final item. The previous version returned from inside the loop on the
    first yielded item, so the progress message "Status: Uploading audio..."
    was returned as if it were the transcript. The generator is now consumed
    to its end and the last item is used.

    Args:
        text_input: Text typed by the user (may be empty or ``None``).
        audio_input: Audio file path, or ``None`` when no audio was given.

    Returns:
        ``(query, status)``. ``query`` is ``None`` when neither input was
        provided; on a transcription failure ``query`` is an ``"[ERROR] ..."``
        message and ``status`` is ``"error"``.
    """
    if text_input and text_input.strip():
        logger.info(f"Processing text query...")
        return text_input.strip(), "Status: Processing text query..."
    if audio_input is not None:
        try:
            transcription_result = transcribe_audio(audio_input)
            # Handle generator or direct result
            if hasattr(transcription_result, '__iter__') and not isinstance(transcription_result, str):
                final_message = None
                for message in transcription_result:
                    final_message = message
                    if message.startswith("[ERROR]"):
                        break
                transcription_result = final_message

            if not transcription_result:
                return "[ERROR] Transcription failed: no result was produced.", "error"
            if transcription_result.startswith("[ERROR]"):
                return transcription_result, "error"
            return transcription_result, "Status: Processing audio transcription..."
        except Exception as e:
            logger.error(f"Transcription error: {e}")
            return f"[ERROR] Transcription failed: {e}", "error"
    return None, "Status: Please type a question or provide an audio recording."




def generate_tts_response(cleaned_text, voice_name):
    """Synthesize speech for ``cleaned_text`` with the configured TTS model.

    Args:
        cleaned_text: Text to speak.
        voice_name: Name of the prebuilt voice to request.

    Returns:
        ``((24000, samples), "Status: Success!")`` on success, where
        ``samples`` is an int16 array built from the first returned part.
        When the model returns no candidates or parts, or when any exception
        occurs, the first element is ``None`` and the second is a status
        message describing the problem.
    """
    try:
        # Build the nested voice configuration from the inside out.
        prebuilt_voice = types.PrebuiltVoiceConfig(voice_name=voice_name)
        voice = types.VoiceConfig(prebuilt_voice_config=prebuilt_voice)
        speech = types.SpeechConfig(voice_config=voice)
        tts_config = types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=speech,
        )

        # gemini_client is a factory; call it to get a client instance.
        client = gemini_client()
        response = client.models.generate_content(
            model=config["audio"]["tts_model"],
            contents=cleaned_text,
            config=tts_config)

        parts = response.candidates[0].content.parts if response.candidates else None
        if not parts:
            logger.warning("Model did not return audio content")
            return None, "Status: Model did not return audio."

        pcm_data = parts[0].inline_data.data
        samples = np.frombuffer(pcm_data, dtype=np.int16)
        return (24000, samples), "Status: Success!"

    except Exception as e:
        logger.error(f"TTS Error: {e}")
        return None, f"Status: An error occurred during TTS: {e}"


def process_input_and_generate_speech(text_input, audio_input, voice_name, chat_history):
    """Answer the user's text or audio query and synthesize the answer as speech.

    Args:
        text_input: Typed query (may be empty).
        audio_input: Audio file path, or ``None``.
        voice_name: Prebuilt voice used for speech synthesis.
        chat_history: List of ``{"role", "content"}`` message dicts; ``None``
            is treated as an empty history. The list passed in is not mutated.

    Returns:
        A 5-tuple ``(history, audio, status, text_value, audio_value)``:
        the updated history, ``(sample_rate, samples)`` or ``None``, a status
        string, the value for the text box (the original text when nothing
        was processed, otherwise ``""``) and ``None`` for the audio input.
    """
    # Normalize up front so the error handler below can also rely on a list.
    chat_history = list(chat_history) if chat_history else []
    try:
        query, status = get_transcription_or_text(text_input, audio_input)
        if not query:
            # Return proper message format for Gradio chatbot
            new_history = chat_history + [{"role": "assistant", "content": status}]
            return new_history, None, status, text_input, None

        if status == "error":
            # Transcription failed: ``query`` holds the "[ERROR] ..." message.
            # Report it instead of sending the error text to the pipeline as
            # if it were the user's question.
            new_history = chat_history + [{"role": "assistant", "content": query}]
            return new_history, None, f"Status: {query}", text_input, None

        is_first_turn = len(chat_history) == 0
        new_history = chat_history + [{"role": "user", "content": query}]
        response_html = process_query(query, first_turn=is_first_turn)
        # GeminiHandler.t2t_with_rag handles a tuple result from
        # process_query; apply the same selection rule here so a tuple does
        # not reach the regex below.
        if isinstance(response_html, tuple):
            response_html = response_html[0] if response_html[0] else response_html[1]
        response_html = "" if response_html is None else str(response_html)
        new_history.append({"role": "assistant", "content": response_html})

        # Strip HTML tags so the speech synthesizer receives plain text.
        cleaned_text = re.sub(r'<[^<]+?>', '', response_html).strip()
        if not cleaned_text:
            new_history[-1]["content"] = "The pipeline returned an empty response."
            return new_history, None, "Status: Error - Empty response.", "", None

        # Generate TTS
        audio_data, tts_status = generate_tts_response(cleaned_text, voice_name)
        if not audio_data:
            # Add TTS status to the response
            new_history[-1]["content"] = response_html + f"<br><br><i>({tts_status})</i>"
            return new_history, None, tts_status, "", None
        return new_history, audio_data, tts_status, "", None

    except Exception as e:
        logger.error(f"Error in process_input_and_generate_speech: {e}")
        error_history = chat_history + [{"role": "assistant", "content": f"An error occurred: {str(e)}"}]
        return error_history, None, f"Status: Error - {str(e)}", "", None


# ---------------------------
# Testing Functions
# ---------------------------

def test_encode_audio_functions():
    """Smoke-test ``encode_audio`` and ``encode_audio2`` on a small int16 array.

    Returns:
        A dict mapping ``'encode_audio'`` and ``'encode_audio2'`` to a
        PASS / FAIL / ERROR marker string.
    """
    divider = "="*60
    print("\n" + divider)
    print("TESTING AUDIO ENCODING FUNCTIONS")
    print(divider)

    report = {}

    # Five int16 samples serve as the shared input for both encoders.
    samples = np.array([1, 2, 3, 4, 5], dtype=np.int16)

    # encode_audio must return exactly the keys mime_type/data with PCM type.
    try:
        print("Testing encode_audio...")
        payload = encode_audio(samples)

        has_expected_keys = set(payload.keys()) == {'mime_type', 'data'}
        if not (has_expected_keys and payload['mime_type'] == 'audio/pcm'):
            print("❌ encode_audio: FAIL - incorrect format")
            report['encode_audio'] = "❌ FAIL"
        else:
            print("✅ encode_audio: PASS")
            report['encode_audio'] = "✅ PASS"

    except Exception as e:
        print(f"❌ encode_audio: ERROR - {e}")
        report['encode_audio'] = f"❌ ERROR: {e}"

    # encode_audio2 must return non-empty bytes.
    try:
        print("Testing encode_audio2...")
        raw = encode_audio2(samples)

        if not isinstance(raw, bytes) or len(raw) <= 0:
            print("❌ encode_audio2: FAIL - not bytes or empty")
            report['encode_audio2'] = "❌ FAIL"
        else:
            print("✅ encode_audio2: PASS")
            report['encode_audio2'] = "✅ PASS"

    except Exception as e:
        print(f"❌ encode_audio2: ERROR - {e}")
        report['encode_audio2'] = f"❌ ERROR: {e}"

    return report


def test_numpy_to_wav_conversion():
    """Check that ``numpy_array_to_wav_bytes`` produces WAV-looking bytes.

    A 0.1 s, 440 Hz int16 sine wave at 16 kHz is converted. The output must
    be ``bytes`` longer than 44 bytes, and its RIFF/WAVE markers are checked.

    Returns:
        A dict with ``'wav_conversion'`` and, when the conversion passed,
        ``'wav_header'`` marker strings.
    """
    divider = "="*60
    print("\n" + divider)
    print("TESTING NUMPY TO WAV CONVERSION")
    print(divider)

    report = {}

    # Build the test tone: sine wave scaled to the int16 range.
    sample_rate = 16000
    duration = 0.1  # 0.1 seconds
    frequency = 440  # A4 note
    timeline = np.linspace(0, duration, int(sample_rate * duration))
    tone = (np.sin(2 * np.pi * frequency * timeline) * 32767).astype(np.int16)

    try:
        print("Testing numpy_array_to_wav_bytes...")
        wav_bytes = numpy_array_to_wav_bytes(tone, sample_rate)

        looks_valid = isinstance(wav_bytes, bytes) and len(wav_bytes) > 44  # WAV header is 44 bytes
        if not looks_valid:
            print("❌ WAV conversion: FAIL - invalid output")
            report['wav_conversion'] = "❌ FAIL"
        else:
            print(f"✅ WAV conversion: PASS - Generated {len(wav_bytes)} bytes")
            report['wav_conversion'] = "✅ PASS"

            # The container must start with RIFF and declare the WAVE format.
            riff_tag, wave_tag = wav_bytes[:4], wav_bytes[8:12]
            if riff_tag == b'RIFF' and wave_tag == b'WAVE':
                print("✅ WAV header validation: PASS")
                report['wav_header'] = "✅ PASS"
            else:
                print("⚠️ WAV header validation: WARNING - may not be valid WAV")
                report['wav_header'] = "⚠️ WARNING"

    except Exception as e:
        print(f"❌ WAV conversion: ERROR - {e}")
        report['wav_conversion'] = f"❌ ERROR: {e}"

    return report


def test_gemini_handler_initialization():
    """Check that a ``GeminiHandler`` can be constructed and copied.

    Returns:
        A dict with ``'gemini_handler_init'`` (PASS, PARTIAL n/m or ERROR)
        and ``'gemini_handler_copy'`` (PASS, FAIL or ERROR) marker strings.
    """
    divider = "="*60
    print("\n" + divider)
    print("TESTING GEMINI HANDLER INITIALIZATION")
    print(divider)

    report = {}

    # Construction: verify the attributes the rest of the class relies on.
    try:
        print("Testing GeminiHandler initialization...")
        instance = GeminiHandler()

        attribute_checks = {
            'input_queue': isinstance(instance.input_queue, asyncio.Queue),
            'output_queue': isinstance(instance.output_queue, asyncio.Queue),
            'quit_event': isinstance(instance.quit, asyncio.Event),
            'vad_initialized': hasattr(instance, 'vad'),
            'config_loaded': hasattr(instance, 'model') and instance.model is not None
        }

        total = len(attribute_checks)
        passed = len([ok for ok in attribute_checks.values() if ok])

        print(f"Initialization checks: {passed}/{total}")
        for label in attribute_checks:
            mark = "✅" if attribute_checks[label] else "❌"
            print(f"  {mark} {label}")

        if passed != total:
            report['gemini_handler_init'] = f"⚠️ PARTIAL: {passed}/{total}"
        else:
            report['gemini_handler_init'] = "✅ PASS"

    except Exception as e:
        print(f"❌ GeminiHandler initialization: ERROR - {e}")
        report['gemini_handler_init'] = f"❌ ERROR: {e}"

    # copy() must return a distinct GeminiHandler instance.
    try:
        print("Testing GeminiHandler copy method...")
        source_handler = GeminiHandler()
        duplicate = source_handler.copy()

        is_distinct_handler = isinstance(duplicate, GeminiHandler) and duplicate is not source_handler
        if not is_distinct_handler:
            print("❌ Copy method: FAIL")
            report['gemini_handler_copy'] = "❌ FAIL"
        else:
            print("✅ Copy method: PASS")
            report['gemini_handler_copy'] = "✅ PASS"

    except Exception as e:
        print(f"❌ Copy method: ERROR - {e}")
        report['gemini_handler_copy'] = f"❌ ERROR: {e}"

    return report


def test_transcription_function_validation():
    """Test ``transcribe_audio`` input validation without making API calls.

    Two checks are run against the first item the generator yields:

    1. With ``GEMINI_API_KEY`` removed, the API-key error is reported.
    2. With a key present, a nonexistent path yields the file-not-found
       error. A placeholder key is set when the environment has none,
       because the key check runs before the file check and would otherwise
       mask it. No request is made: the generator stops at the file check.

    The original ``GEMINI_API_KEY`` value (or its absence) is restored in a
    ``finally`` block.

    Returns:
        A dict with ``'api_key_validation'`` and ``'file_validation'``
        marker strings.
    """
    print("\n" + "="*60)
    print("TESTING TRANSCRIPTION FUNCTION VALIDATION")
    print("="*60)

    results = {}
    env_name = "GEMINI_API_KEY"
    original_key = os.environ.get(env_name)

    try:
        # Test with missing API key
        print("Testing with missing API key...")
        os.environ.pop(env_name, None)
        try:
            gen = transcribe_audio("nonexistent.wav")
            result = next(gen)

            if result.startswith("[ERROR]") and "API Key" in result:
                print("✅ API key validation: PASS")
                results['api_key_validation'] = "✅ PASS"
            else:
                print("❌ API key validation: FAIL")
                results['api_key_validation'] = "❌ FAIL"

        except Exception as e:
            print(f"❌ API key validation: ERROR - {e}")
            results['api_key_validation'] = f"❌ ERROR: {e}"

        # Test with nonexistent file; a key must be present to get past the
        # API-key check.
        print("Testing with nonexistent file...")
        os.environ[env_name] = original_key or "placeholder-key-for-validation-test"
        try:
            gen = transcribe_audio("definitely_nonexistent_file.wav")
            result = next(gen)

            if result.startswith("[ERROR]") and "not found" in result:
                print("✅ File validation: PASS")
                results['file_validation'] = "✅ PASS"
            else:
                print("❌ File validation: FAIL")
                results['file_validation'] = "❌ FAIL"

        except Exception as e:
            print(f"❌ File validation: ERROR - {e}")
            results['file_validation'] = f"❌ ERROR: {e}"
    finally:
        # Leave the environment exactly as it was found.
        if original_key is None:
            os.environ.pop(env_name, None)
        else:
            os.environ[env_name] = original_key

    return results


def test_text_input_processing():
    """Exercise ``get_transcription_or_text`` on text-only and empty inputs.

    Three cases are run, none with audio: a real question, an empty string
    and a whitespace-only string.

    Returns:
        A dict with ``'text_input'``, ``'empty_input'`` and
        ``'whitespace_input'`` marker strings.
    """
    divider = "="*60
    print("\n" + divider)
    print("TESTING TEXT INPUT PROCESSING")
    print(divider)

    report = {}

    # Case 1: typed text is returned unchanged with a text-query status.
    print("Testing with text input...")
    try:
        question = "What is autism?"
        no_audio = None

        query, status = get_transcription_or_text(question, no_audio)

        if not (query == question and "text query" in status):
            print("❌ Text input processing: FAIL")
            report['text_input'] = "❌ FAIL"
        else:
            print("✅ Text input processing: PASS")
            report['text_input'] = "✅ PASS"

    except Exception as e:
        print(f"❌ Text input processing: ERROR - {e}")
        report['text_input'] = f"❌ ERROR: {e}"

    # Case 2: no text and no audio yields no query and a prompt to the user.
    print("Testing with empty inputs...")
    try:
        query, status = get_transcription_or_text("", None)

        if not (query is None and "Please type" in status):
            print("❌ Empty input handling: FAIL")
            report['empty_input'] = "❌ FAIL"
        else:
            print("✅ Empty input handling: PASS")
            report['empty_input'] = "✅ PASS"

    except Exception as e:
        print(f"❌ Empty input handling: ERROR - {e}")
        report['empty_input'] = f"❌ ERROR: {e}"

    # Case 3: whitespace-only text is treated like empty text.
    print("Testing with whitespace input...")
    try:
        query, status = get_transcription_or_text("   \n\t  ", None)

        if not (query is None and "Please type" in status):
            print("❌ Whitespace input handling: FAIL")
            report['whitespace_input'] = "❌ FAIL"
        else:
            print("✅ Whitespace input handling: PASS")
            report['whitespace_input'] = "✅ PASS"

    except Exception as e:
        print(f"❌ Whitespace input handling: ERROR - {e}")
        report['whitespace_input'] = f"❌ ERROR: {e}"

    return report


def test_tts_function_structure():
    """Probe ``generate_tts_response`` with an invalid voice and with empty text.

    Both probes count an exception as an expected outcome, since the call
    depends on an external API.

    Returns:
        A dict with ``'tts_error_handling'`` and ``'tts_empty_text'`` marker
        strings.
    """
    divider = "="*60
    print("\n" + divider)
    print("TESTING TTS FUNCTION STRUCTURE")
    print(divider)

    report = {}

    # Probe 1: an unknown voice name should produce an error status, not a crash.
    print("Testing TTS function error handling...")
    try:
        audio_data, status = generate_tts_response("Hello world", "invalid_voice")

        if audio_data is not None:
            print("✅ TTS function: UNEXPECTED SUCCESS - function worked")
            report['tts_error_handling'] = "✅ UNEXPECTED SUCCESS"
        elif "error" in status.lower():
            print("✅ TTS error handling: PASS")
            report['tts_error_handling'] = "✅ PASS"
        else:
            print("❌ TTS error handling: FAIL")
            report['tts_error_handling'] = "❌ FAIL"

    except Exception as e:
        # This is expected if API is not available
        print(f"✅ TTS error handling: EXPECTED ERROR - {str(e)[:100]}")
        report['tts_error_handling'] = "✅ EXPECTED ERROR"

    # Probe 2: empty text should not yield audio.
    print("Testing TTS with empty text...")
    try:
        audio_data, status = generate_tts_response("", "Puck")

        if audio_data is not None:
            print("⚠️ Empty text handling: WARNING - generated audio for empty text")
            report['tts_empty_text'] = "⚠️ WARNING"
        else:
            print("✅ Empty text handling: PASS")
            report['tts_empty_text'] = "✅ PASS"

    except Exception as e:
        print(f"✅ Empty text handling: EXPECTED ERROR - {str(e)[:100]}")
        report['tts_empty_text'] = "✅ EXPECTED ERROR"

    return report


def test_main_processing_function():
    """Test the main process_input_and_generate_speech function.

    Two scenarios are exercised:

    1. A text-only query. The call must yield exactly five values (history,
       audio, status, cleared text, cleared audio); the history must be a
       list with at least two entries and the status must be a string.
    2. Empty text with no audio. The returned status must contain
       "Please type".

    Returns:
        dict: check name -> status string prefixed with ✅, ⚠️ or ❌.
    """
    print("\n" + "="*60)
    print("TESTING MAIN PROCESSING FUNCTION")
    print("="*60)

    results = {}
    expected_items = 5

    # Test with valid text input
    print("Testing main processing with text input...")
    try:
        text_input = "What is autism?"
        audio_input = None
        voice_name = "Puck"
        chat_history = []

        # Materialise the result BEFORE unpacking. Unpacking first would turn
        # a wrong number of return values into a ValueError that the handler
        # below mislabels as an API problem, and would make the length check
        # a tautology.
        outputs = tuple(process_input_and_generate_speech(
            text_input, audio_input, voice_name, chat_history
        ))

        # Check if function returns expected structure
        if len(outputs) == expected_items:
            new_history, _audio_data, status, _cleared_text, _cleared_audio = outputs
            print("✅ Return structure: PASS - correct number of return values")
            results['return_structure'] = "✅ PASS"

            # Check if history is updated
            if isinstance(new_history, list) and len(new_history) >= 2:
                print("✅ Chat history update: PASS")
                results['history_update'] = "✅ PASS"
            else:
                print("❌ Chat history update: FAIL")
                results['history_update'] = "❌ FAIL"

            # Check status
            if isinstance(status, str):
                print("✅ Status return: PASS")
                results['status_return'] = "✅ PASS"
            else:
                print("❌ Status return: FAIL")
                results['status_return'] = "❌ FAIL"

        else:
            print(f"❌ Return structure: FAIL - expected {expected_items} items")
            results['return_structure'] = "❌ FAIL"

    except Exception as e:
        # Failures here are treated as a missing/unavailable API dependency.
        print(f"⚠️ Main processing: EXPECTED ERROR - {str(e)[:100]}")
        results['main_processing'] = "⚠️ EXPECTED ERROR (API dependency)"

    # Test with empty inputs
    print("Testing main processing with empty inputs...")
    try:
        # Only the status message matters for this scenario.
        _, _, status, _, _ = process_input_and_generate_speech(
            "", None, "Puck", []
        )

        if isinstance(status, str) and "Please type" in status:
            print("✅ Empty input handling: PASS")
            results['empty_input_main'] = "✅ PASS"
        else:
            print("❌ Empty input handling: FAIL")
            results['empty_input_main'] = "❌ FAIL"

    except Exception as e:
        print(f"❌ Empty input handling: ERROR - {e}")
        results['empty_input_main'] = f"❌ ERROR: {e}"

    return results


def test_environment_and_config():
    """Check the loaded configuration and the API-key environment variables.

    Three things are inspected: that ``config`` has an 'audio' section, that
    the audio section carries every expected key, and that each required
    environment variable has a non-empty value.

    Returns:
        dict: 'config_loading', 'audio_config' and one entry per environment
        variable, each mapped to a status string prefixed with ✅, ⚠️ or ❌.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TESTING ENVIRONMENT AND CONFIGURATION")
    print(rule)

    report = {}

    # Top-level sections that must exist in the config.
    try:
        print("Testing configuration loading...")
        section_present = {name: name in config for name in ['audio']}
        missing_sections = [name for name, found in section_present.items() if not found]

        if missing_sections:
            print(f"❌ Config loading: FAIL - missing keys: {missing_sections}")
            report['config_loading'] = f"❌ FAIL: missing {missing_sections}"
        else:
            print("✅ Config loading: PASS")
            report['config_loading'] = "✅ PASS"

    except Exception as exc:
        print(f"❌ Config loading: ERROR - {exc}")
        report['config_loading'] = f"❌ ERROR: {exc}"

    # Keys that must exist inside the 'audio' section.
    try:
        print("Testing audio configuration...")
        if 'audio' not in config:
            print("❌ Audio configuration: FAIL - no audio section")
            report['audio_config'] = "❌ FAIL"
        else:
            audio_section = config['audio']
            wanted_keys = ['model_live', 'tts_model', 'stt_model', 'VAD_RATE', 'VAD_FRAME_MS']
            key_present = {name: name in audio_section for name in wanted_keys}

            found_count = sum(key_present.values())
            wanted_count = len(key_present)

            print(f"Audio config checks: {found_count}/{wanted_count}")
            for name, found in key_present.items():
                mark = "✅" if found else "❌"
                print(f"  {mark} {name}")

            if found_count == wanted_count:
                report['audio_config'] = "✅ PASS"
            else:
                report['audio_config'] = f"⚠️ PARTIAL: {found_count}/{wanted_count}"

    except Exception as exc:
        print(f"❌ Audio configuration: ERROR - {exc}")
        report['audio_config'] = f"❌ ERROR: {exc}"

    # Environment variables: unset and empty are both reported as NOT SET.
    print("Testing environment variables...")
    for variable in ('GEMINI_API_KEY', 'SILICONFLOW_API_KEY'):
        if os.getenv(variable):
            print(f"✅ {variable}: SET")
            report[variable] = "✅ SET"
        else:
            print(f"❌ {variable}: NOT SET")
            report[variable] = "❌ NOT SET"

    return report


def create_test_audio_file(filename="test_audio.wav", duration=1.0, sample_rate=16000):
    """Create a test audio file containing a 440 Hz sine tone.

    Args:
        filename: Path of the audio file to write.
        duration: Length of the tone in seconds.
        sample_rate: Samples per second used for generation and for writing.

    Returns:
        The ``filename`` that was written, or ``None`` if anything failed
        (the error is printed rather than raised).
    """
    try:
        num_samples = int(sample_rate * duration)

        # endpoint=False keeps the sample spacing at exactly 1/sample_rate.
        # Including the endpoint would stretch the spacing to
        # duration/(num_samples - 1) and play the tone slightly below 440 Hz.
        t = np.linspace(0, duration, num_samples, endpoint=False)
        frequency = 440  # A4 note

        # Scale to 30% of int16 full scale before converting to 16-bit samples.
        audio_data = (np.sin(2 * np.pi * frequency * t) * 0.3 * 32767).astype(np.int16)

        # Save as WAV file
        sf.write(filename, audio_data, sample_rate)
        return filename
    except Exception as e:
        print(f"Failed to create test audio file: {e}")
        return None


def _mean_call_seconds(func, payload, repeats):
    """Return the mean seconds per call of ``func(payload)`` over ``repeats`` calls."""
    # perf_counter is monotonic and high-resolution; time.time() can jump with
    # system clock adjustments and is too coarse for sub-millisecond calls.
    started = time.perf_counter()
    for _ in range(repeats):
        func(payload)
    return (time.perf_counter() - started) / repeats


def run_performance_benchmarks():
    """Run performance benchmarks on key functions.

    Times ``encode_audio`` and ``encode_audio2`` on random int16 buffers of
    1000, 10000 and 100000 samples (100 calls each), and
    ``numpy_array_to_wav_bytes`` on a 16000-sample buffer (10 calls).

    Returns:
        dict: benchmark name -> mean time per call formatted as "<x.xx>ms".
        Keys are 'encode_audio_<size>', 'encode_audio2_<size>' and
        'wav_conversion_benchmark'. These are measurements, not pass/fail
        statuses.
    """
    print("\n" + "="*60)
    print("RUNNING PERFORMANCE BENCHMARKS")
    print("="*60)

    results = {}

    # Benchmark encode_audio functions
    print("Benchmarking audio encoding functions...")
    test_data_sizes = [1000, 10000, 100000]  # Different sizes

    for size in test_data_sizes:
        test_data = np.random.randint(-32768, 32767, size, dtype=np.int16)

        encode_audio_time = _mean_call_seconds(encode_audio, test_data, 100)
        encode_audio2_time = _mean_call_seconds(encode_audio2, test_data, 100)

        print(f"Size {size} samples:")
        print(f"  encode_audio: {encode_audio_time*1000:.2f}ms")
        print(f"  encode_audio2: {encode_audio2_time*1000:.2f}ms")

        results[f'encode_audio_{size}'] = f"{encode_audio_time*1000:.2f}ms"
        results[f'encode_audio2_{size}'] = f"{encode_audio2_time*1000:.2f}ms"

    # Benchmark WAV conversion
    print("\nBenchmarking WAV conversion...")
    test_audio = np.random.randint(-32768, 32767, 16000, dtype=np.int16)  # 1 second

    wav_time = _mean_call_seconds(numpy_array_to_wav_bytes, test_audio, 10)

    print(f"WAV conversion (1s audio): {wav_time*1000:.2f}ms")
    results['wav_conversion_benchmark'] = f"{wav_time*1000:.2f}ms"

    return results


def run_integration_tests():
    """Run checks that combine several components of this module.

    Two checks are performed, each isolated in its own try/except so that a
    failure in one is reported without preventing the other:

    * a GeminiHandler can be constructed and both audio encoders return a
      truthy value for a small int16 array;
    * get_transcription_or_text echoes a plain text query and reports
      "text query" in its status.

    Returns:
        dict: 'handler_encoding' and 'text_pipeline' mapped to "✅ PASS",
        "❌ FAIL" or "❌ ERROR: <exception>".
    """
    divider = "=" * 60
    print("\n" + divider)
    print("RUNNING INTEGRATION TESTS")
    print(divider)

    summary = {}

    # Check 1: handler construction alongside both encoders.
    print("Testing GeminiHandler initialization with audio encoding...")
    try:
        gemini = GeminiHandler()
        samples = np.array([1, 2, 3, 4, 5], dtype=np.int16)

        # Test if handler can work with encoded audio
        pieces = (gemini, encode_audio(samples), encode_audio2(samples))

        if all(pieces):
            print("✅ Handler + Encoding integration: PASS")
            summary['handler_encoding'] = "✅ PASS"
        else:
            print("❌ Handler + Encoding integration: FAIL")
            summary['handler_encoding'] = "❌ FAIL"

    except Exception as exc:
        print(f"❌ Handler + Encoding integration: ERROR - {exc}")
        summary['handler_encoding'] = f"❌ ERROR: {exc}"

    # Check 2: a typed query must pass through the text path unchanged.
    print("Testing text processing pipeline...")
    try:
        typed_query = "Hello world"
        echoed, note = get_transcription_or_text(typed_query, None)

        pipeline_ok = echoed == typed_query and "text query" in note
        if pipeline_ok:
            print("✅ Text processing pipeline: PASS")
            summary['text_pipeline'] = "✅ PASS"
        else:
            print("❌ Text processing pipeline: FAIL")
            summary['text_pipeline'] = "❌ FAIL"

    except Exception as exc:
        print(f"❌ Text processing pipeline: ERROR - {exc}")
        summary['text_pipeline'] = f"❌ ERROR: {exc}"

    return summary


def run_all_tests():
    """Run all test functions and provide a comprehensive report.

    Each category function returns a dict of ``check name -> result string``.
    A check counts as passed when its result starts with "✅"; a category
    passes when at least 80% of its checks pass. The "Performance" category
    holds timing strings (e.g. "0.12ms") rather than pass/fail statuses, so it
    is listed for information and left out of the totals instead of being
    scored as a failure.

    The overall status is "SYSTEM READY" at >= 80% of scored categories
    passing, "NEEDS ATTENTION" at >= 60%, and "REQUIRES FIXES" otherwise.

    Returns:
        dict: category name -> that category's raw results.
    """
    print("\n" + "🧪" + "="*58)
    print("🧪 RUNNING COMPREHENSIVE AUDIO UTILS TESTS")
    print("🧪" + "="*58)

    test_results = {}

    # Run all test categories
    print("Starting audio utilities test suite...")

    test_results["Environment & Config"] = test_environment_and_config()
    test_results["Audio Encoding"] = test_encode_audio_functions()
    test_results["WAV Conversion"] = test_numpy_to_wav_conversion()
    test_results["GeminiHandler"] = test_gemini_handler_initialization()
    test_results["Transcription Validation"] = test_transcription_function_validation()
    test_results["Text Processing"] = test_text_input_processing()
    test_results["TTS Structure"] = test_tts_function_structure()
    test_results["Main Processing"] = test_main_processing_function()
    test_results["Performance"] = run_performance_benchmarks()
    test_results["Integration"] = run_integration_tests()

    # Categories whose values are measurements, not ✅/⚠️/❌ statuses.
    informational_categories = {"Performance"}

    # Print comprehensive summary
    print("\n" + "📋" + "="*58)
    print("📋 COMPREHENSIVE TEST SUMMARY")
    print("📋" + "="*58)

    total_categories = 0
    passed_categories = 0

    for category, results in test_results.items():
        print(f"\n🔧 {category}:")

        if not isinstance(results, dict):
            # Not a per-check mapping: shown verbatim, counted but never passed.
            total_categories += 1
            print(f"  📊 {results}")
            continue

        if category in informational_categories:
            for test_name, result in results.items():
                print(f"  • {test_name}: {result}")
            print("  📊 Informational only (not scored)")
            continue

        total_categories += 1
        category_passed = 0
        category_total = 0

        for test_name, result in results.items():
            category_total += 1
            # str() guards against a check that stored a non-string result.
            text = str(result)
            if text.startswith("✅"):
                category_passed += 1
                status = "PASS"
            elif text.startswith("⚠️"):
                status = "WARNING"
            else:
                status = "FAIL/ERROR"

            print(f"  • {test_name}: {status}")

        category_success_rate = category_passed / category_total if category_total > 0 else 0
        if category_success_rate >= 0.8:  # 80% success rate
            passed_categories += 1

        print(f"  📊 Category Score: {category_passed}/{category_total} ({category_success_rate:.1%})")

    # Overall summary
    overall_success_rate = passed_categories / total_categories if total_categories > 0 else 0
    print(f"\n🏆 OVERALL RESULTS:")
    print(f"   Categories Passed: {passed_categories}/{total_categories}")
    print(f"   Success Rate: {overall_success_rate:.1%}")

    if overall_success_rate >= 0.8:
        print("   Status: ✅ SYSTEM READY")
    elif overall_success_rate >= 0.6:
        print("   Status: ⚠️ NEEDS ATTENTION")
    else:
        print("   Status: ❌ REQUIRES FIXES")

    print("\n🏁 Audio utilities testing completed!")
    return test_results


if __name__ == "__main__":
    # Script entry point: with a command-line argument, run that one test mode
    # and exit; without arguments, loop on an interactive menu until the user
    # picks "0" or closes the input stream.
    logger.info("Audio utils module loaded successfully.")

    # Interactive testing menu
    print("\n" + "🎵" + "="*58)
    print("🎵 AUDIO UTILS TESTING SUITE")
    print("🎵" + "="*58)

    import sys

    # Single source of truth for the command-line modes. Insertion order is
    # also the order shown in the "Available modes" hint.
    cli_modes = {
        "all": run_all_tests,
        "encoding": test_encode_audio_functions,
        "wav": test_numpy_to_wav_conversion,
        "handler": test_gemini_handler_initialization,
        "transcription": test_transcription_function_validation,
        "text": test_text_input_processing,
        "tts": test_tts_function_structure,
        "main": test_main_processing_function,
        "env": test_environment_and_config,
        "performance": run_performance_benchmarks,
        "integration": run_integration_tests,
    }

    if len(sys.argv) > 1:
        # Command line mode
        mode = sys.argv[1].lower()
        runner = cli_modes.get(mode)

        if runner is not None:
            runner()
        else:
            print(f"Unknown test mode: {mode}")
            print("Available modes: " + ", ".join(cli_modes))

    else:
        # Interactive mode
        menu_actions = {
            "1": run_all_tests,
            "2": test_environment_and_config,
            "3": test_encode_audio_functions,
            "4": test_numpy_to_wav_conversion,
            "5": test_gemini_handler_initialization,
            "6": test_transcription_function_validation,
            "7": test_text_input_processing,
            "8": test_tts_function_structure,
            "9": test_main_processing_function,
            "10": run_performance_benchmarks,
            "11": run_integration_tests,
        }
        menu_lines = (
            "1.  🌐 Run All Tests",
            "2.  🔧 Environment & Config",
            "3.  🎧 Audio Encoding Functions",
            "4.  🎵 WAV Conversion",
            "5.  🤖 GeminiHandler Tests",
            "6.  🎤 Transcription Validation",
            "7.  📝 Text Processing",
            "8.  🔊 TTS Function Structure",
            "9.  🎛️  Main Processing Function",
            "10. ⚡ Performance Benchmarks",
            "11. 🔗 Integration Tests",
            "12. 🧪 Create Test Audio File",
            "0.  🚪 Exit",
        )

        while True:
            print("\n" + "🎵" + " "*20 + "TEST MENU" + " "*20 + "🎵")
            for line in menu_lines:
                print(line)

            try:
                choice = input("\nEnter your choice (0-12): ").strip()
            except (EOFError, KeyboardInterrupt):
                # stdin was closed or the user pressed Ctrl-C at the prompt:
                # leave the menu instead of dying with a traceback.
                print("\n👋 Audio testing complete!")
                break

            if choice == "0":
                print("\n👋 Audio testing complete!")
                break

            if choice in menu_actions:
                menu_actions[choice]()
            elif choice == "12":
                filename = create_test_audio_file()
                if filename:
                    print(f"✅ Test audio file created: {filename}")
                else:
                    print("❌ Failed to create test audio file")
            else:
                print("❌ Invalid choice. Please try again.")

            try:
                input("\nPress Enter to continue...")
            except (EOFError, KeyboardInterrupt):
                print("\n👋 Audio testing complete!")
                break