mcp-sentiment / app.py
Lum4yx's picture
Update app.py
d1e4dc7 verified
raw
history blame
4.95 kB
import gradio as gr
from textblob import TextBlob
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
import base64
import numpy as np
import ffmpeg
# Run on the first CUDA GPU in half precision when one is available;
# otherwise stay on the CPU with full-precision floats.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Whisper checkpoint used for speech recognition.
model_id = "openai/whisper-small"

# Load the pretrained speech-to-text weights, then move them to the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor used below.
processor = AutoProcessor.from_pretrained(model_id)

# Assemble the speech-recognition pipeline from the components loaded above.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)
def sentiment_analysis(text: str) -> dict:
    """
    Score the sentiment of the given text with TextBlob.

    Args:
        text (str): The text to analyze.

    Returns:
        dict: The input text under "transcript", the "polarity" and
        "subjectivity" scores rounded to two decimals, and an "assessment"
        of "positive", "negative" or "neutral" taken from the sign of the
        unrounded polarity.
    """
    scores = TextBlob(text).sentiment
    polarity = scores.polarity

    # The label is based on the raw polarity, not on the rounded value.
    if polarity > 0:
        label = "positive"
    elif polarity < 0:
        label = "negative"
    else:
        label = "neutral"

    return {
        "transcript": text,
        "polarity": round(polarity, 2),
        "subjectivity": round(scores.subjectivity, 2),
        "assessment": label,
    }
def process_base64_audio(base64_data_uri: str) -> dict:
    """
    Transcribe a Base64 audio data URI and analyze the sentiment of the speech.

    The payload is decoded in memory, converted by ffmpeg to 16 kHz mono
    16-bit PCM, transcribed with the module-level Whisper pipeline ``pipe``,
    and the transcript is handed to ``sentiment_analysis``.

    Args:
        base64_data_uri (str): A string in data URI format
            (e.g., "data:audio/wav;base64,UklGRi...").

    Returns:
        dict: The sentiment analysis result, or ``{"error": <message>}`` when
        the URI is missing, malformed or empty, or when decoding, conversion
        or transcription fails.
    """
    invalid_input = {"error": "Invalid or empty Base64 data URI provided."}
    if not base64_data_uri or "base64," not in base64_data_uri:
        return invalid_input

    # Take everything after the "base64," marker the guard just checked for.
    # Splitting on the first comma instead would pick the wrong payload when
    # the media-type parameters themselves contain a comma.
    encoded_data = base64_data_uri.partition("base64,")[2]
    try:
        audio_data = base64.b64decode(encoded_data)
    except ValueError as e:  # binascii.Error (bad padding etc.) is a ValueError
        return {"error": f"Failed to process audio: {e}"}

    # An empty payload can never be decoded as audio; report it as invalid
    # input instead of spawning ffmpeg only to get an opaque failure back.
    if not audio_data:
        return invalid_input

    try:
        # Use ffmpeg to convert the in-memory audio data to a raw PCM buffer.
        # The pipeline expects a 16kHz mono audio stream.
        out, _ = (
            ffmpeg
            .input("pipe:0")
            .output("pipe:1", format="s16le", ac=1, ar=16000)
            .run(input=audio_data, capture_stdout=True, capture_stderr=True)
        )
        if not out:
            return {"error": "Failed to process audio: no audio samples could be decoded."}

        # Scale signed 16-bit samples to 32-bit floats in [-1.0, 1.0).
        audio_np = np.frombuffer(out, np.int16).astype(np.float32) / 32768.0

        # Transcribe the audio from the NumPy array using the HF pipeline.
        transcription_result = pipe(audio_np)
        transcript_text = transcription_result["text"]
    except ffmpeg.Error as e:
        # str(e) only says "see stderr output for detail"; the actual reason
        # is in the captured stderr, whose last line is appended here.
        stderr_text = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        reason = stderr_text.splitlines()[-1] if stderr_text else ""
        message = f"{e}: {reason}" if reason else str(e)
        return {"error": f"Failed to process audio: {message}"}
    except Exception as e:
        # Boundary handler: any other failure (e.g. from the model) is
        # reported to the caller as an error payload instead of raising.
        return {"error": f"Failed to process audio: {str(e)}"}

    # Perform sentiment analysis on the transcribed text.
    return sentiment_analysis(transcript_text)
# Create the Gradio interface that exposes process_base64_audio.
demo = gr.Interface(
    fn=process_base64_audio,
    # The input remains a Textbox to accept the raw Base64 string from the API client
    inputs=gr.Textbox(lines=5, placeholder="Paste your Base64 encoded audio data URI here...", label="Base64 Audio Input"),
    outputs=gr.JSON(label="Analysis Result"),
    # The title and description are built from model_id so the UI always names
    # the checkpoint that is actually loaded (previously they advertised
    # whisper-large-v3 while the app loads a different checkpoint).
    title=f"\U0001F399\uFE0F Audio Sentiment Analysis ({model_id})",
    description=f"""
Analyze the sentiment of spoken words.
This tool accepts a **Base64 encoded audio data URI**, transcribes the audio in-memory using the `{model_id}` model,
and performs sentiment analysis on the text with TextBlob.
""",
    # NOTE(review): these example URIs are truncated placeholders, not real audio.
    examples=[
        ["data:audio/wav;base64,UklGRiQ...<placeholder_for_a_short_positive_clip>"],
        ["data:audio/wav;base64,UklGRiQ...<placeholder_for_a_short_negative_clip>"]
    ],
    article="""
### How to get a Base64 Audio URI?
You can use an online converter or a script (like the provided `test_client.py`) to convert a short audio file (e.g., .wav or .mp3) into a Base64 data URI.
The format must be `data:audio/[format];base64,[encoded_string]`.
""",
    # NOTE(review): confirm the installed Gradio version still recognizes this theme name.
    theme='huggingface'
)
# Launch the interface and MCP server
if __name__ == "__main__":
    # ffmpeg must be installed on the system: process_base64_audio uses it to decode the submitted audio.
    # The required Python packages can be installed with:
    # pip install gradio textblob "transformers[torch]" accelerate safetensors ffmpeg-python numpy
    # NOTE(review): resource needs depend on the checkpoint selected by model_id.
    demo.launch(mcp_server=True)