# mcp-sentiment / app.py
import gradio as gr
from textblob import TextBlob
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
import numpy as np
import os
import glob
# 1. Set up device and data type for optimized performance
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# 2. Define the model ID for the Whisper model
model_id = "openai/whisper-small"
# 3. Load the model from pretrained weights
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
# 4. Load the processor which includes the feature extractor and tokenizer
processor = AutoProcessor.from_pretrained(model_id)
# 5. Create the ASR pipeline with the loaded components
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)
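# The ASR pipeline accepts raw audio either as a NumPy array or as a dict of the form
# {"raw": <float32 array>, "sampling_rate": <int>}, and returns a dict like {"text": "..."}.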


def sentiment_analysis(text: str) -> dict:
    """
    Analyze the sentiment of the given text with TextBlob.

    Returns a dict with the transcript, polarity (-1.0 to 1.0),
    subjectivity (0.0 to 1.0), and an overall assessment.
    """
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return {
        "transcript": text,
        "polarity": round(sentiment.polarity, 2),
        "subjectivity": round(sentiment.subjectivity, 2),
        "assessment": "positive" if sentiment.polarity > 0 else "negative" if sentiment.polarity < 0 else "neutral",
    }
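
# Illustrative example (scores come from TextBlob's lexicon; the values below are approximate):
# sentiment_analysis("Great talk!")
# -> {"transcript": "Great talk!", "polarity": 0.8, "subjectivity": 0.75, "assessment": "positive"}
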
# NEW: Simplified main function to process audio from a NumPy array
def analyze_audio(audio: tuple) -> dict:
    """
    Processes audio data from a NumPy array, transcribes it, and analyzes its sentiment.
    Gradio provides the audio as a tuple (sample_rate, data).
    """
    if audio is None:
        return {"error": "No audio provided. Please upload, record, or select an example."}

    # Unpack the audio tuple
    sample_rate, audio_data = audio

    # Gradio delivers PCM samples (typically int16); convert to float32 in [-1.0, 1.0],
    # which is what the Whisper feature extractor expects.
    audio_float32 = audio_data.astype(np.float32)
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_float32 = audio_float32 / 32768.0
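    # Stereo clips arrive as a 2-D array of shape (samples, channels);
    # average the channels so the model receives mono audio.
    if audio_float32.ndim > 1:
        audio_float32 = audio_float32.mean(axis=1)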
    try:
        # Transcribe the audio, passing the sampling rate so the pipeline can resample to 16 kHz
        transcription_result = pipe({"raw": audio_float32, "sampling_rate": sample_rate})
        transcript_text = transcription_result["text"].strip()
        if not transcript_text:
            return {"error": "Transcription failed or audio was silent."}
    except Exception as e:
        return {"error": f"Failed to transcribe audio: {str(e)}"}

    # Perform sentiment analysis on the transcript
    return sentiment_analysis(transcript_text)

# --- Code to find and load examples ---
examples_dir = "examples"
if not os.path.exists(examples_dir):
    os.makedirs(examples_dir)
    print(f"Created '{examples_dir}/' directory. Please add your audio examples there.")

example_files = (
    glob.glob(os.path.join(examples_dir, "*.wav")) +
    glob.glob(os.path.join(examples_dir, "*.mp3")) +
    glob.glob(os.path.join(examples_dir, "*.flac"))
)
examples_list = [[file] for file in example_files]
# --- End of example loading ---
# Create the Gradio interface
demo = gr.Interface(
    fn=analyze_audio,  # CHANGED: Point to the new, simplified function
    inputs=gr.Audio(type="numpy", label="Upload Audio File or Record"),  # CHANGED: type="numpy"
    outputs=gr.JSON(label="Analysis Result"),
    title="🎙️ Audio Sentiment Analysis (Whisper Small)",
    description="Analyze the sentiment of spoken words. Upload an audio file, record directly, or click an example below.",
    examples=examples_list,
    article="""
### How it Works
This tool uses OpenAI's **Whisper Small** model to transcribe audio into text.
Then, **TextBlob** is used to perform sentiment analysis on the resulting transcript.
By using `type="numpy"`, the interface directly processes audio data, making it more reliable.
""",
    theme="huggingface",
)
# Launch the interface
if __name__ == "__main__":
    demo.launch(mcp_server=True)
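# Note: the imports above require gradio (with the [mcp] extra for mcp_server=True),
# transformers, torch, textblob, and numpy, typically listed in the Space's requirements.txt.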