import gradio as gr
from textblob import TextBlob
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
import numpy as np
import os
import glob
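
# Prefer GPU with half precision when available; otherwise fall back to CPU/float32.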
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-small"
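
# Load the model once at startup and move it to the selected device.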
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)
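
# Bundle the model, tokenizer, and feature extractor into a single ASR pipeline.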
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)


def sentiment_analysis(text: str) -> dict:
    """
    Analyze the sentiment of the given text.

    Returns the transcript together with TextBlob's polarity (-1.0 to 1.0),
    subjectivity (0.0 to 1.0), and an overall assessment.
    """
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return {
        "transcript": text,
        "polarity": round(sentiment.polarity, 2),
        "subjectivity": round(sentiment.subjectivity, 2),
        "assessment": "positive" if sentiment.polarity > 0 else "negative" if sentiment.polarity < 0 else "neutral",
    }


def analyze_audio(audio: tuple) -> dict:
    """
    Process audio data from a NumPy array, transcribe it, and analyze its sentiment.

    Gradio provides the audio as a tuple (sample_rate, data).
    """
    if audio is None:
        return {"error": "No audio provided. Please upload, record, or select an example."}

    sample_rate, audio_data = audio

    # Down-mix stereo to mono; Whisper expects single-channel input.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Gradio delivers 16-bit PCM samples; scale to float32 in [-1.0, 1.0].
    audio_float32 = audio_data.astype(np.float32) / 32768.0

    try:
        # Pass the sampling rate alongside the raw samples so the pipeline
        # can resample to the 16 kHz that Whisper expects.
        transcription_result = pipe({"sampling_rate": sample_rate, "raw": audio_float32})
        transcript_text = transcription_result["text"].strip()

        if not transcript_text:
            return {"error": "Transcription failed or audio was silent."}

    except Exception as e:
        return {"error": f"Failed to transcribe audio: {str(e)}"}

    return sentiment_analysis(transcript_text)
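

# Collect bundled audio examples so they appear as one-click inputs in the UI.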
examples_dir = "examples"
if not os.path.exists(examples_dir):
    os.makedirs(examples_dir)
    print(f"Created '{examples_dir}/' directory. Please add your audio examples there.")

example_files = (
    glob.glob(os.path.join(examples_dir, "*.wav")) +
    glob.glob(os.path.join(examples_dir, "*.mp3")) +
    glob.glob(os.path.join(examples_dir, "*.flac"))
)
examples_list = [[file] for file in example_files]
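
# Wire everything into a Gradio interface: numpy audio in, JSON analysis out.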
demo = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="numpy", label="Upload Audio File or Record"),
    outputs=gr.JSON(label="Analysis Result"),
    title="🎙️ Audio Sentiment Analysis (Whisper Small)",
    description="Analyze the sentiment of spoken words. Upload an audio file, record directly, or click an example below.",
    examples=examples_list,
    article="""
### How it Works
This tool uses OpenAI's **Whisper Small** model to transcribe audio into text.
Then, **TextBlob** performs sentiment analysis on the resulting transcript.
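The analysis is returned as JSON in the following shape (values are illustrative):

```json
{
  "transcript": "...",
  "polarity": 0.35,
  "subjectivity": 0.40,
  "assessment": "positive"
}
```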
With `type="numpy"`, the interface receives the raw sample data (sample rate and waveform) directly instead of a temporary file path.
""",
    theme='huggingface',
)


if __name__ == "__main__":
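    # mcp_server=True also exposes the app's functions as MCP tools;
    # this requires the gradio[mcp] extra to be installed.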
    demo.launch(mcp_server=True)