Lum4yx committed on
Commit b08b4ee · verified · 1 Parent(s): c96cde1

Update app.py

Files changed (1)
  1. app.py +67 -25
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 from textblob import TextBlob
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import torch
 import base64
 import numpy as np
@@ -8,17 +8,23 @@ import ffmpeg
 import os
 import glob # Imported to find example files
 
+# 1. Set up device and data type for optimized performance
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
+# 2. Define the model ID for the Whisper model
 model_id = "openai/whisper-small"
 
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
+# 3. Load the model from pretrained weights
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
 )
 model.to(device)
+
+# 4. Load the processor which includes the feature extractor and tokenizer
 processor = AutoProcessor.from_pretrained(model_id)
 
+# 5. Create the ASR pipeline with the loaded components
 pipe = pipeline(
     "automatic-speech-recognition",
     model=model,
@@ -31,7 +37,7 @@ pipe = pipeline(
 
 def sentiment_analysis(text: str) -> dict:
     """
-    Analyze the sentiment of the given text. (This function is unchanged)
+    Analyze the sentiment of the given text.
     """
     blob = TextBlob(text)
     sentiment = blob.sentiment
@@ -46,65 +52,101 @@ def sentiment_analysis(text: str) -> dict:
 def process_audio(audio_path: str) -> dict:
     """
     Processes an audio file from a local path, transcribes it, and analyzes its sentiment.
-    Args:
-        audio_path (str): The file path to the audio file. Or a base64 string of the audio for a remote MCP server.
-
-    Returns:
-        dict: The sentiment analysis result or an error message.
     """
     if not audio_path or not os.path.exists(audio_path):
         return {"error": "Invalid or non-existent file path provided."}
 
     try:
-        # Use ffmpeg to read the audio file and convert it to a raw PCM buffer.
-        # The pipeline expects a 16kHz mono audio stream.
         out, _ = (
             ffmpeg
             .input(audio_path)
             .output('pipe:1', format='s16le', ac=1, ar=16000)
             .run(capture_stdout=True, capture_stderr=True)
         )
-
-        # Convert the raw PCM buffer to a NumPy array of 32-bit floats.
         audio_np = np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
+        transcription_result = pipe(audio_np)
+        transcript_text = transcription_result["text"]
+    except Exception as e:
+        return {"error": f"Failed to process audio file: {str(e)}"}
 
-        # Transcribe the audio from the NumPy array using the HF pipeline
+    return sentiment_analysis(transcript_text)
+
+def process_base64_audio(base64_data_uri: str) -> dict:
+    """
+    Decodes a Base64 audio data URI, processes it in-memory, transcribes it, and analyzes its sentiment.
+    """
+    if not isinstance(base64_data_uri, str) or "base64," not in base64_data_uri:
+        return {"error": "Invalid or empty Base64 data URI provided."}
+
+    try:
+        _, encoded_data = base64_data_uri.split(',', 1)
+        audio_data = base64.b64decode(encoded_data)
+        out, _ = (
+            ffmpeg
+            .input('pipe:0')
+            .output('pipe:1', format='s16le', ac=1, ar=16000)
+            .run(input=audio_data, capture_stdout=True, capture_stderr=True)
+        )
+        audio_np = np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
         transcription_result = pipe(audio_np)
         transcript_text = transcription_result["text"]
-
     except Exception as e:
-        # Capture potential errors from ffmpeg or the model
-        return {"error": f"Failed to process audio: {str(e)}"}
+        return {"error": f"Failed to process Base64 audio: {str(e)}"}
 
-    # Perform sentiment analysis on the transcribed text
     return sentiment_analysis(transcript_text)
 
+def analyze_audio_input(audio_input: str) -> dict:
+    """
+    Router function to handle both file paths and Base64 strings.
+    This allows the Gradio UI to use file uploads and the API to use Base64.
+    """
+    # Check if the input is a valid file path provided by the Gradio component
+    if audio_input and os.path.exists(audio_input):
+        return process_audio(audio_input)
+    # Otherwise, assume it's a Base64 string from an API call
+    elif isinstance(audio_input, str):
+        return process_base64_audio(audio_input)
+    else:
+        return {"error": f"Invalid input type: {type(audio_input)}"}
+
+
+# --- Code to find and load examples ---
+examples_dir = "examples"
+if not os.path.exists(examples_dir):
+    os.makedirs(examples_dir)
+    print(f"Created '{examples_dir}/' directory. Please add your audio examples there.")
+
 example_files = (
-    glob.glob(os.path.join(examples_dir, "*.wav"))
+    glob.glob(os.path.join(examples_dir, "*.wav")) +
+    glob.glob(os.path.join(examples_dir, "*.mp3")) +
+    glob.glob(os.path.join(examples_dir, "*.flac"))
 )
 examples_list = [[file] for file in example_files]
+# --- End of example loading ---
 
 
+# Create the Gradio interface
 demo = gr.Interface(
-    fn=process_audio, # The function to call
-    # The input is an Audio component that accepts file uploads or microphone input
+    fn=analyze_audio_input, # Point to the main router function
    inputs=gr.Audio(type="filepath", label="Upload Audio File or Record"),
     outputs=gr.JSON(label="Analysis Result"),
     title="🎙️ Audio Sentiment Analysis (Whisper Small)",
     description="""
     Analyze the sentiment of spoken words.
-    Upload an audio file, record audio directly, or **click on an example below**.
-    The tool will transcribe the audio using `openai/whisper-small` and perform sentiment analysis on the text with TextBlob.
+    **UI**: Upload an audio file, record directly, or click an example.
+    **API**: The endpoint also accepts a Base64 encoded audio data URI as input.
     """,
-    examples=examples_list, # Use the globbed list of examples here
+    examples=examples_list,
     article="""
     ### How it Works
-    This tool uses a speech-to-text model to transcribe the audio, and then a sentiment analysis model to determine if the transcribed text is positive, negative, or neutral.
+    This tool uses a speech-to-text model (`openai/whisper-small`) to transcribe audio, then TextBlob analyzes the text sentiment.
+    The server can handle both local file paths (from the UI) and Base64 strings (from API calls).
     """,
     theme='huggingface'
 )
 
 # Launch the interface and MCP server
 if __name__ == "__main__":
+    # Ensure ffmpeg is installed on your system.
     # pip install gradio textblob "transformers[torch]" accelerate safetensors ffmpeg-python numpy
     demo.launch(mcp_server=True)
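
The updated description says the API path also accepts a Base64 encoded audio data URI. A minimal client-side sketch of building a string that analyze_audio_input() will route to process_base64_audio() is shown below; the file name is hypothetical, and how the string is actually submitted (Gradio client, MCP call, plain HTTP) depends on your setup and is not shown here.

import base64

# Hypothetical local file; any format ffmpeg can decode (wav/mp3/flac) should work,
# since the server re-encodes the raw bytes to 16 kHz mono PCM before transcription.
with open("sample.wav", "rb") as f:
    payload = base64.b64encode(f.read()).decode("utf-8")

# process_base64_audio() only requires the "base64," marker before the payload, and
# analyze_audio_input() falls through to it when the string is not an existing path.
data_uri = f"data:audio/wav;base64,{payload}"

The server splits on the first comma and decodes everything after it, so any "data:audio/...;base64," prefix works.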