import gradio as gr
from textblob import TextBlob
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
import numpy as np
import os
import glob

# 1. Set up device and data type for optimized performance
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# 2. Define the model ID for the Whisper model
model_id = "openai/whisper-small"

# 3. Load the model from pretrained weights
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# 4. Load the processor which includes the feature extractor and tokenizer
processor = AutoProcessor.from_pretrained(model_id)

# 5. Create the ASR pipeline with the loaded components
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)
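
# Illustrative usage (not executed): the ASR pipeline accepts either a file path or a dict
# holding the raw waveform and its sampling rate, e.g.
#   pipe("examples/sample.wav")                               # path shown here is hypothetical
#   pipe({"raw": waveform_float32, "sampling_rate": 16000})
# The dict form lets the pipeline resample the audio to the 16 kHz that Whisper expects.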

def sentiment_analysis(text: str) -> dict:
    """
    Analyze the sentiment of the given text with TextBlob and return the transcript
    along with its polarity, subjectivity, and an overall assessment.
    """
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return {
        "transcript": text,
        "polarity": round(sentiment.polarity, 2),
        "subjectivity": round(sentiment.subjectivity, 2),
        "assessment": "positive" if sentiment.polarity > 0 else "negative" if sentiment.polarity < 0 else "neutral"
    }
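
# Example (illustrative): sentiment_analysis("This demo is great") returns an "assessment"
# of "positive" with a polarity above 0. Polarity ranges from -1.0 (most negative) to 1.0
# (most positive); subjectivity ranges from 0.0 (objective) to 1.0 (subjective).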

# Main entry point: transcribe audio delivered by Gradio as a NumPy array and analyze its sentiment
def analyze_audio(audio: tuple) -> dict:
    """
    Processes audio data from a NumPy array, transcribes it, and analyzes its sentiment.
    Gradio provides the audio as a tuple (sample_rate, data).
    """
    if audio is None:
        return {"error": "No audio provided. Please upload, record, or select an example."}

    # Unpack the (sample_rate, data) tuple provided by Gradio
    sample_rate, audio_data = audio

    # Collapse stereo recordings to mono so the model receives a 1-D waveform
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Convert the int16 PCM samples Gradio provides to float32 in [-1.0, 1.0]
    audio_float32 = audio_data.astype(np.float32) / 32768.0

    try:
        # Transcribe the audio, passing the sampling rate so the pipeline can resample to 16 kHz
        transcription_result = pipe({"raw": audio_float32, "sampling_rate": sample_rate})
        transcript_text = transcription_result["text"].strip()
        
        if not transcript_text:
             return {"error": "Transcription failed or audio was silent."}

    except Exception as e:
        return {"error": f"Failed to transcribe audio: {str(e)}"}

    # Perform sentiment analysis on the transcript
    return sentiment_analysis(transcript_text)
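
# Local smoke test (illustrative sketch, commented out so the module stays import-safe):
# feed analyze_audio the same (sample_rate, int16 array) tuple Gradio produces, here a
# 1-second 440 Hz tone. A pure tone may yield the "silent audio" error or a throwaway
# transcript, which is enough to exercise the code path end to end.
# sr = 16000
# t = np.linspace(0, 1.0, sr, endpoint=False)
# tone = (0.3 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
# print(analyze_audio((sr, tone)))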


# --- Code to find and load examples ---
examples_dir = "examples"
if not os.path.exists(examples_dir):
    os.makedirs(examples_dir)
    print(f"Created '{examples_dir}/' directory. Please add your audio examples there.")

example_files = (
    glob.glob(os.path.join(examples_dir, "*.wav")) +
    glob.glob(os.path.join(examples_dir, "*.mp3")) +
    glob.glob(os.path.join(examples_dir, "*.flac"))
)
examples_list = [[file] for file in example_files]
# --- End of example loading ---
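
# The resulting examples_list has the form [["examples/clip1.wav"], ["examples/clip2.mp3"], ...]
# (file names here are placeholders): one single-element row per example, matching the
# interface's single Audio input below.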


# Create the Gradio interface
demo = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="numpy", label="Upload Audio File or Record"),  # type="numpy" delivers (sample_rate, np.ndarray)
    outputs=gr.JSON(label="Analysis Result"),
    title="🎙️ Audio Sentiment Analysis (Whisper Small)",
    description="Analyze the sentiment of spoken words. Upload an audio file, record directly, or click an example below.",
    examples=examples_list,
    article="""
    ### How it Works
    This tool uses OpenAI's **Whisper Small** model to transcribe audio into text.
    **TextBlob** then performs sentiment analysis on the resulting transcript.
    With `type="numpy"`, Gradio hands the function the raw waveform as a (sample_rate, array) tuple instead of a file path.
    """,
    theme="soft"  # a current built-in Gradio theme; 'huggingface' was a Gradio 3.x theme name
)

# Launch the interface
if __name__ == "__main__":
    demo.launch(mcp_server=True)
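
# With mcp_server=True (available in recent Gradio releases), launch() also exposes
# analyze_audio as an MCP tool (its docstring becomes the tool description), so MCP-capable
# clients can call the app programmatically in addition to the regular web UI.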