Spaces:

Lum4yx
/

mcp-sentiment

Sleeping

App Files Files Community

Lum4yx committed on Sep 20

Commit

b08b4ee

verified ·

1 Parent(s): c96cde1

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -25

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import gradio as gr
 from textblob import TextBlob
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import torch
 import base64
 import numpy as np
@@ -8,17 +8,23 @@ import ffmpeg
 import os
 import glob # Imported to find example files
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 model_id = "openai/whisper-small"
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
 )
 model.to(device)
 processor = AutoProcessor.from_pretrained(model_id)
 pipe = pipeline(
     "automatic-speech-recognition",
     model=model,
@@ -31,7 +37,7 @@ pipe = pipeline(
 def sentiment_analysis(text: str) -> dict:
     """
-    Analyze the sentiment of the given text. (This function is unchanged)
     """
     blob = TextBlob(text)
     sentiment = blob.sentiment
@@ -46,65 +52,101 @@ def sentiment_analysis(text: str) -> dict:
 def process_audio(audio_path: str) -> dict:
     """
     Processes an audio file from a local path, transcribes it, and analyzes its sentiment.
-    Args:
-        audio_path (str): The file path to the audio file. Or a base64 string of the audio for a remote MCP server.
-    Returns:
-        dict: The sentiment analysis result or an error message.
     """
     if not audio_path or not os.path.exists(audio_path):
         return {"error": "Invalid or non-existent file path provided."}
     try:
-        # Use ffmpeg to read the audio file and convert it to a raw PCM buffer.
-        # The pipeline expects a 16kHz mono audio stream.
         out, _ = (
             ffmpeg
             .input(audio_path)
             .output('pipe:1', format='s16le', ac=1, ar=16000)
             .run(capture_stdout=True, capture_stderr=True)
         )
-        # Convert the raw PCM buffer to a NumPy array of 32-bit floats.
         audio_np = np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
-        # Transcribe the audio from the NumPy array using the HF pipeline
         transcription_result = pipe(audio_np)
         transcript_text = transcription_result["text"]
     except Exception as e:
-        # Capture potential errors from ffmpeg or the model
-        return {"error": f"Failed to process audio: {str(e)}"}
-    # Perform sentiment analysis on the transcribed text
     return sentiment_analysis(transcript_text)
 example_files = (
-    glob.glob(os.path.join(examples_dir, "*.wav"))
 )
 examples_list = [[file] for file in example_files]
 demo = gr.Interface(
-    fn=process_audio, # The function to call
-    # The input is an Audio component that accepts file uploads or microphone input
     inputs=gr.Audio(type="filepath", label="Upload Audio File or Record"),
     outputs=gr.JSON(label="Analysis Result"),
     title="🎙️ Audio Sentiment Analysis (Whisper Small)",
     description="""
     Analyze the sentiment of spoken words.
-    Upload an audio file, record audio directly, or **click on an example below**.
-    The tool will transcribe the audio using `openai/whisper-small` and perform sentiment analysis on the text with TextBlob.
     """,
-    examples=examples_list, # Use the globbed list of examples here
     article="""
     ### How it Works
-    This tool uses a speech-to-text model to transcribe the audio, and then a sentiment analysis model to determine if the transcribed text is positive, negative, or neutral.
     """,
     theme='huggingface'
 )
 # Launch the interface and MCP server
 if __name__ == "__main__":
     # pip install gradio textblob "transformers[torch]" accelerate safetensors ffmpeg-python numpy
     demo.launch(mcp_server=True)

 import gradio as gr
 from textblob import TextBlob
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import torch
 import base64
 import numpy as np
 import os
 import glob # Imported to find example files
+# 1. Set up device and data type for optimized performance
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# 2. Define the model ID for the Whisper model
 model_id = "openai/whisper-small"
+# 3. Load the model from pretrained weights
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
 )
 model.to(device)
+# 4. Load the processor which includes the feature extractor and tokenizer
 processor = AutoProcessor.from_pretrained(model_id)
+# 5. Create the ASR pipeline with the loaded components
 pipe = pipeline(
     "automatic-speech-recognition",
     model=model,
 def sentiment_analysis(text: str) -> dict:
     """
+    Analyze the sentiment of the given text.
     """
     blob = TextBlob(text)
     sentiment = blob.sentiment
 def process_audio(audio_path: str) -> dict:
     """
     Processes an audio file from a local path, transcribes it, and analyzes its sentiment.
     """
     if not audio_path or not os.path.exists(audio_path):
         return {"error": "Invalid or non-existent file path provided."}
     try:
         out, _ = (
             ffmpeg
             .input(audio_path)
             .output('pipe:1', format='s16le', ac=1, ar=16000)
             .run(capture_stdout=True, capture_stderr=True)
         )
         audio_np = np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
+        transcription_result = pipe(audio_np)
+        transcript_text = transcription_result["text"]
+    except Exception as e:
+        return {"error": f"Failed to process audio file: {str(e)}"}
+    return sentiment_analysis(transcript_text)
+def process_base64_audio(base64_data_uri: str) -> dict:
+    """
+    Decodes a Base64 audio data URI, processes it in-memory, transcribes it, and analyzes its sentiment.
+    """
+    if not isinstance(base64_data_uri, str) or "base64," not in base64_data_uri:
+        return {"error": "Invalid or empty Base64 data URI provided."}
+    try:
+        _, encoded_data = base64_data_uri.split(',', 1)
+        audio_data = base64.b64decode(encoded_data)
+        out, _ = (
+            ffmpeg
+            .input('pipe:0')
+            .output('pipe:1', format='s16le', ac=1, ar=16000)
+            .run(input=audio_data, capture_stdout=True, capture_stderr=True)
+        )
+        audio_np = np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
         transcription_result = pipe(audio_np)
         transcript_text = transcription_result["text"]
     except Exception as e:
+        return {"error": f"Failed to process Base64 audio: {str(e)}"}
     return sentiment_analysis(transcript_text)
+def analyze_audio_input(audio_input: str) -> dict:
+    """
+    Router function to handle both file paths and Base64 strings.
+    This allows the Gradio UI to use file uploads and the API to use Base64.
+    """
+    # Check if the input is a valid file path provided by the Gradio component
+    if audio_input and os.path.exists(audio_input):
+        return process_audio(audio_input)
+    # Otherwise, assume it's a Base64 string from an API call
+    elif isinstance(audio_input, str):
+        return process_base64_audio(audio_input)
+    else:
+        return {"error": f"Invalid input type: {type(audio_input)}"}
+# --- Code to find and load examples ---
+examples_dir = "examples"
+if not os.path.exists(examples_dir):
+    os.makedirs(examples_dir)
+    print(f"Created '{examples_dir}/' directory. Please add your audio examples there.")
 example_files = (
+    glob.glob(os.path.join(examples_dir, "*.wav")) +
+    glob.glob(os.path.join(examples_dir, "*.mp3")) +
+    glob.glob(os.path.join(examples_dir, "*.flac"))
 )
 examples_list = [[file] for file in example_files]
+# --- End of example loading ---
+# Create the Gradio interface
 demo = gr.Interface(
+    fn=analyze_audio_input, # Point to the main router function
     inputs=gr.Audio(type="filepath", label="Upload Audio File or Record"),
     outputs=gr.JSON(label="Analysis Result"),
     title="🎙️ Audio Sentiment Analysis (Whisper Small)",
     description="""
     Analyze the sentiment of spoken words.
+    **UI**: Upload an audio file, record directly, or click an example.
+    **API**: The endpoint also accepts a Base64 encoded audio data URI as input.
     """,
+    examples=examples_list,
     article="""
     ### How it Works
+    This tool uses a speech-to-text model (`openai/whisper-small`) to transcribe audio, then TextBlob analyzes the text sentiment.
+    The server can handle both local file paths (from the UI) and Base64 strings (from API calls).
     """,
     theme='huggingface'
 )
 # Launch the interface and MCP server
 if __name__ == "__main__":
+    # Ensure the ffmpeg executable is installed and on PATH; the ffmpeg-python package below is only a wrapper around it.
     # pip install gradio textblob "transformers[torch]" accelerate safetensors ffmpeg-python numpy
     demo.launch(mcp_server=True)