# Clone: https://huggingface.co/spaces/Lum4yx/mcp-sentiment/blob/main/app.py
import gradio as gr
from textblob import TextBlob
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
import numpy as np
import os
import glob

# 1. Set up device and data type for optimized performance
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# 2. Define the model ID for the Whisper model (the PyTorch "small" checkpoint,
#    matching the Whisper Small model described in the interface text)
model_id = "openai/whisper-small"

# 3. Load the model from pretrained weights
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# 4. Load the processor which includes the feature extractor and tokenizer
processor = AutoProcessor.from_pretrained(model_id)

# 5. Create the ASR pipeline with the loaded components
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)
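
# The ASR pipeline accepts either a path to an audio file or a dict of the form
# {"sampling_rate": int, "raw": np.ndarray}; the dict form is used in analyze_audio below
# so Gradio's in-memory NumPy audio can be transcribed without writing a temporary file.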

def sentiment_analysis(text: str) -> dict:
    """
    Analyze the sentiment of the given text.
    """
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return {
        "transcript": text,
        "polarity": round(sentiment.polarity, 2),
        "subjectivity": round(sentiment.subjectivity, 2),
        "assessment": "positive" if sentiment.polarity > 0 else "negative" if sentiment.polarity < 0 else "neutral"
    }
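
# Illustrative call (exact scores depend on TextBlob's lexicon):
#
#   sentiment_analysis("I really enjoyed this talk")
#   -> {"transcript": "...", "polarity": <float in [-1, 1]>, "subjectivity": <float in [0, 1]>,
#       "assessment": "positive" | "negative" | "neutral"}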

# NEW: Simplified main function to process audio from a NumPy array
def analyze_audio(audio: tuple) -> dict:
    """
    Processes audio data from a NumPy array, transcribes it, and analyzes its sentiment.
    Gradio provides the audio as a tuple (sample_rate, data).
    """
    if audio is None:
        return {"error": "No audio provided. Please upload, record, or select an example."}

    # Unpack the audio tuple
    sample_rate, audio_data = audio

    # Down-mix stereo recordings to mono, since the ASR pipeline expects a 1-D waveform
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Convert integer PCM samples to float32 in [-1, 1], the range the model expects
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_float32 = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    else:
        audio_float32 = audio_data.astype(np.float32)

    try:
        # Transcribe the audio, passing the sample rate so the pipeline can resample to 16 kHz
        transcription_result = pipe({"sampling_rate": sample_rate, "raw": audio_float32})
        transcript_text = transcription_result["text"].strip()

        if not transcript_text:
            return {"error": "Transcription failed or audio was silent."}

    except Exception as e:
        return {"error": f"Failed to transcribe audio: {str(e)}"}

    # Perform sentiment analysis on the transcript
    return sentiment_analysis(transcript_text)
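
# Minimal local smoke test (kept commented out so it never runs on import): feed the
# function a one-second 440 Hz tone shaped like the (sample_rate, int16 array) tuple
# Gradio produces. A pure tone carries no speech, so expect either the empty-transcript
# error or a nonsensical transcript; the point is only to exercise the audio handling.
#
#   sr = 16000
#   t = np.linspace(0, 1, sr, endpoint=False)
#   tone = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
#   print(analyze_audio((sr, tone)))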


# --- Code to find and load examples ---
examples_dir = "examples"
if not os.path.exists(examples_dir):
    os.makedirs(examples_dir)
    print(f"Created '{examples_dir}/' directory. Please add your audio examples there.")

example_files = (
    glob.glob(os.path.join(examples_dir, "*.wav")) +
    glob.glob(os.path.join(examples_dir, "*.mp3")) +
    glob.glob(os.path.join(examples_dir, "*.flac"))
)
examples_list = [[file] for file in example_files]
# --- End of example loading ---


# Create the Gradio interface
demo = gr.Interface(
    fn=analyze_audio,  # CHANGED: Point to the new, simplified function
    inputs=gr.Audio(type="numpy", label="Upload Audio File or Record"),  # CHANGED: type="numpy"
    outputs=gr.JSON(label="Analysis Result"),
    title="🎙️ Audio Sentiment Analysis (Whisper Small)",
    description="Analyze the sentiment of spoken words. Upload an audio file, record directly, or click an example below.",
    examples=examples_list,
    article="""
    ### How it Works
    This tool uses OpenAI's **Whisper Small** model to transcribe audio into text.
    Then, **TextBlob** is used to perform sentiment analysis on the resulting transcript.
    By using `type="numpy"`, the interface passes the raw waveform and its sample rate directly to the model instead of a temporary file path, which makes processing more reliable.
    """,
    theme="default"  # the legacy "huggingface" theme name is not available in current Gradio releases
)

# Launch the interface
if __name__ == "__main__":
    demo.launch(mcp_server=True)
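
# Note: with mcp_server=True (requires a recent Gradio release with the mcp extra
# installed), the function wrapped by the Interface is also exposed as an MCP tool,
# using its docstring and type hints as the tool description, alongside the web UI.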