import gradio as gr
from textblob import TextBlob
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
import numpy as np
import os
import glob

# 1. Set up device and data type for optimized performance
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# 2. Define the model ID for the Whisper model
model_id = "openai/whisper-small"

# 3. Load the model from pretrained weights
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# 4. Load the processor, which bundles the feature extractor and tokenizer
processor = AutoProcessor.from_pretrained(model_id)

# 5. Create the ASR pipeline with the loaded components
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)


def sentiment_analysis(text: str) -> dict:
    """
    Analyze the sentiment of the given text with TextBlob.
    """
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return {
        "transcript": text,
        "polarity": round(sentiment.polarity, 2),
        "subjectivity": round(sentiment.subjectivity, 2),
        "assessment": (
            "positive" if sentiment.polarity > 0
            else "negative" if sentiment.polarity < 0
            else "neutral"
        ),
    }


# NEW: Simplified main function to process audio from a NumPy array
def analyze_audio(audio: tuple) -> dict:
    """
    Process audio data from a NumPy array, transcribe it, and analyze its sentiment.
    Gradio provides the audio as a tuple (sample_rate, data).
    """
    if audio is None:
        return {"error": "No audio provided. Please upload, record, or select an example."}

    # Unpack the audio tuple
    sample_rate, audio_data = audio

    # Down-mix stereo recordings to mono, since Whisper expects 1-D audio
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Convert the int16 samples Gradio provides to float32 in [-1, 1]
    audio_float32 = audio_data.astype(np.float32) / 32768.0

    try:
        # Transcribe the audio; passing the sample rate lets the pipeline
        # resample to the 16 kHz rate Whisper expects
        transcription_result = pipe({"sampling_rate": sample_rate, "raw": audio_float32})
        transcript_text = transcription_result["text"].strip()
        if not transcript_text:
            return {"error": "Transcription failed or audio was silent."}
    except Exception as e:
        return {"error": f"Failed to transcribe audio: {str(e)}"}

    # Perform sentiment analysis on the transcript
    return sentiment_analysis(transcript_text)


# --- Code to find and load examples ---
examples_dir = "examples"
if not os.path.exists(examples_dir):
    os.makedirs(examples_dir)
    print(f"Created '{examples_dir}/' directory. Please add your audio examples there.")

example_files = (
    glob.glob(os.path.join(examples_dir, "*.wav"))
    + glob.glob(os.path.join(examples_dir, "*.mp3"))
    + glob.glob(os.path.join(examples_dir, "*.flac"))
)
examples_list = [[file] for file in example_files]
# --- End of example loading ---

# Create the Gradio interface
demo = gr.Interface(
    fn=analyze_audio,  # CHANGED: Point to the new, simplified function
    inputs=gr.Audio(type="numpy", label="Upload Audio File or Record"),  # CHANGED: type="numpy"
    outputs=gr.JSON(label="Analysis Result"),
    title="🎙️ Audio Sentiment Analysis (Whisper Small)",
    description="Analyze the sentiment of spoken words. Upload an audio file, record directly, or click an example below.",
    examples=examples_list,
    article="""
### How it Works
This tool uses OpenAI's **Whisper Small** model to transcribe audio into text.
Then, **TextBlob** performs sentiment analysis on the resulting transcript.
By using `type="numpy"`, the interface processes the audio data directly, making it more reliable.
""", theme='huggingface' ) # Launch the interface if __name__ == "__main__": demo.launch(mcp_server=True)