import gradio as gr
from textblob import TextBlob
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
import numpy as np
import os
import glob
# 1. Set up device and data type for optimized performance
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# 2. Define the model ID for the Whisper model
model_id = "openai/whisper-small"
# 3. Load the model from pretrained weights
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
# 4. Load the processor which includes the feature extractor and tokenizer
processor = AutoProcessor.from_pretrained(model_id)
# 5. Create the ASR pipeline with the loaded components
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)
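# Quick local sanity check (a sketch, not part of the app): the pipeline accepts either a
# raw float32 array already at 16 kHz, or a dict with "sampling_rate" and "raw" so it can
# resample on the fly. Uncomment to verify the model loads, using one second of silence.
# _ = pipe({"sampling_rate": 16000, "raw": np.zeros(16000, dtype=np.float32)})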
def sentiment_analysis(text: str) -> dict:
    """
    Analyze the sentiment of the given text with TextBlob.
    Polarity ranges from -1.0 (negative) to 1.0 (positive); subjectivity from 0.0 (objective) to 1.0 (subjective).
    """
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return {
        "transcript": text,
        "polarity": round(sentiment.polarity, 2),
        "subjectivity": round(sentiment.subjectivity, 2),
        "assessment": "positive" if sentiment.polarity > 0 else "negative" if sentiment.polarity < 0 else "neutral"
    }
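# Usage sketch (scores are illustrative; real values come from TextBlob's lexicon):
# sentiment_analysis("What a great talk!")
# -> {"transcript": "What a great talk!", "polarity": 0.8, "subjectivity": 0.75, "assessment": "positive"}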
# NEW: Simplified main function to process audio from a NumPy array
def analyze_audio(audio: tuple) -> dict:
    """
    Processes audio data from a NumPy array, transcribes it, and analyzes its sentiment.
    Gradio provides the audio as a tuple (sample_rate, data).
    """
    if audio is None:
        return {"error": "No audio provided. Please upload, record, or select an example."}
    # Unpack the audio tuple
    sample_rate, audio_data = audio
    # Collapse stereo recordings to mono, since the model expects a single channel
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    # Convert the 16-bit PCM samples to the float32 range [-1.0, 1.0] the model expects
    audio_float32 = audio_data.astype(np.float32) / 32768.0
    try:
        # Transcribe the audio, passing the sample rate so the pipeline can resample to 16 kHz
        transcription_result = pipe({"sampling_rate": sample_rate, "raw": audio_float32})
        transcript_text = transcription_result["text"].strip()
        if not transcript_text:
            return {"error": "Transcription failed or audio was silent."}
    except Exception as e:
        return {"error": f"Failed to transcribe audio: {str(e)}"}
    # Perform sentiment analysis on the transcript
    return sentiment_analysis(transcript_text)
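# Local usage sketch with a hypothetical clip; Gradio passes the same (sample_rate, int16 array) tuple:
# sr, data = 48000, np.zeros(48000, dtype=np.int16)  # one second of silence at 48 kHz
# print(analyze_audio((sr, data)))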
# --- Code to find and load examples ---
examples_dir = "examples"
if not os.path.exists(examples_dir):
    os.makedirs(examples_dir)
    print(f"Created '{examples_dir}/' directory. Please add your audio examples there.")
example_files = (
    glob.glob(os.path.join(examples_dir, "*.wav")) +
    glob.glob(os.path.join(examples_dir, "*.mp3")) +
    glob.glob(os.path.join(examples_dir, "*.flac"))
)
examples_list = [[file] for file in example_files]
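# examples_list holds one single-element row per file, e.g. [["examples/clip1.wav"], ["examples/clip2.mp3"]]
# (file names here are hypothetical); this nested-list format is what gr.Interface expects for examples.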
# --- End of example loading ---
# Create the Gradio interface
demo = gr.Interface(
    fn=analyze_audio,  # CHANGED: Point to the new, simplified function
    inputs=gr.Audio(type="numpy", label="Upload Audio File or Record"),  # CHANGED: type="numpy"
    outputs=gr.JSON(label="Analysis Result"),
    title="🎙️ Audio Sentiment Analysis (Whisper Small)",
    description="Analyze the sentiment of spoken words. Upload an audio file, record directly, or click an example below.",
    examples=examples_list,
    article="""
### How it Works
This tool uses OpenAI's **Whisper Small** model to transcribe audio into text.
Then, **TextBlob** is used to perform sentiment analysis on the resulting transcript.
By using `type="numpy"`, the interface directly processes audio data, making it more reliable.
""",
    theme='huggingface'
)
# Launch the interface
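# mcp_server=True requires a recent Gradio release; when supported, it also exposes
# analyze_audio as an MCP tool alongside the web UI.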
if __name__ == "__main__":
    demo.launch(mcp_server=True)