import gradio as gr
from textblob import TextBlob
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
import numpy as np
import os
import glob

# 1. Set up device and data type for optimized performance
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# 2. Define the model ID for the Whisper model
model_id = "openai/whisper-small"

# 3. Load the model from pretrained weights
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# 4. Load the processor, which bundles the feature extractor and tokenizer
processor = AutoProcessor.from_pretrained(model_id)

# 5. Create the ASR pipeline with the loaded components
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)


def sentiment_analysis(text: str) -> dict:
    """
    Analyze the sentiment of the given text with TextBlob.
    """
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return {
        "transcript": text,
        "polarity": round(sentiment.polarity, 2),
        "subjectivity": round(sentiment.subjectivity, 2),
        "assessment": (
            "positive" if sentiment.polarity > 0
            else "negative" if sentiment.polarity < 0
            else "neutral"
        ),
    }


# NEW: Simplified main function to process audio from a NumPy array
def analyze_audio(audio: tuple) -> dict:
    """
    Process audio data from a NumPy array, transcribe it, and analyze its sentiment.
    Gradio provides the audio as a tuple (sample_rate, data).
    """
    if audio is None:
        return {"error": "No audio provided. Please upload, record, or select an example."}

    # Unpack the audio tuple
    sample_rate, audio_data = audio

    # Down-mix stereo recordings to mono, since Whisper expects 1-D audio
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Convert the int16 samples Gradio provides to float32 in [-1, 1]
    audio_float32 = audio_data.astype(np.float32) / 32768.0

    try:
        # Transcribe the audio; passing the sample rate lets the pipeline
        # resample to the 16 kHz rate Whisper expects
        transcription_result = pipe({"sampling_rate": sample_rate, "raw": audio_float32})
        transcript_text = transcription_result["text"].strip()
        if not transcript_text:
            return {"error": "Transcription failed or audio was silent."}
    except Exception as e:
        return {"error": f"Failed to transcribe audio: {str(e)}"}

    # Perform sentiment analysis on the transcript
    return sentiment_analysis(transcript_text)


# --- Code to find and load examples ---
examples_dir = "examples"
if not os.path.exists(examples_dir):
    os.makedirs(examples_dir)
    print(f"Created '{examples_dir}/' directory. Please add your audio examples there.")

example_files = (
    glob.glob(os.path.join(examples_dir, "*.wav"))
    + glob.glob(os.path.join(examples_dir, "*.mp3"))
    + glob.glob(os.path.join(examples_dir, "*.flac"))
)
examples_list = [[file] for file in example_files]
# --- End of example loading ---

# Create the Gradio interface
demo = gr.Interface(
    fn=analyze_audio,  # CHANGED: Point to the new, simplified function
    inputs=gr.Audio(type="numpy", label="Upload Audio File or Record"),  # CHANGED: type="numpy"
    outputs=gr.JSON(label="Analysis Result"),
    title="🎙️ Audio Sentiment Analysis (Whisper Small)",
    description="Analyze the sentiment of spoken words. Upload an audio file, record directly, or click an example below.",
    examples=examples_list,
    article="""
### How it Works
This tool uses OpenAI's **Whisper Small** model to transcribe audio into text.
Then, **TextBlob** performs sentiment analysis on the resulting transcript.
By using `type="numpy"`, the interface processes the audio data directly, making it more reliable.
""", theme='huggingface' ) # Launch the interface if __name__ == "__main__": demo.launch(mcp_server=True)