import gradio as gr
from textblob import TextBlob
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
import base64
import numpy as np
import ffmpeg

# 1. Set up device and data type for optimized performance
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# 2. Define the model ID for the Whisper small checkpoint
model_id = "openai/whisper-small"

# 3. Load the model from pretrained weights
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# 4. Load the processor, which includes the feature extractor and tokenizer
processor = AutoProcessor.from_pretrained(model_id)

# 5. Create the ASR pipeline with the loaded components
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)
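
# Usage sketch (not executed at import time): the pipeline accepts a raw
# 16 kHz mono NumPy float32 array and returns a dict with a "text" key, e.g.:
# pipe(np.zeros(16000, dtype=np.float32))["text"]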

def sentiment_analysis(text: str) -> dict:
    """
    Analyze the sentiment of the given text with TextBlob.
    """
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return {
        "transcript": text,
        "polarity": round(sentiment.polarity, 2),
        "subjectivity": round(sentiment.subjectivity, 2),
        "assessment": "positive" if sentiment.polarity > 0 else "negative" if sentiment.polarity < 0 else "neutral",
    }
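
# Illustrative output (approximate values; TextBlob scores can vary by version):
# sentiment_analysis("I love this!")
# -> {"transcript": "I love this!", "polarity": 0.5, "subjectivity": 0.6, "assessment": "positive"}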

def process_base64_audio(base64_data_uri: str) -> dict:
    """
    Decodes a Base64 audio data URI, processes it in memory,
    transcribes it using the Hugging Face Whisper pipeline, and then analyzes its sentiment.

    Args:
        base64_data_uri (str): A string in data URI format (e.g., "data:audio/wav;base64,UklGRi...").

    Returns:
        dict: The sentiment analysis result or an error message.
    """
    if not base64_data_uri or "base64," not in base64_data_uri:
        return {"error": "Invalid or empty Base64 data URI provided."}
    try:
        # Parse the data URI to extract the Base64 encoded data
        _, encoded_data = base64_data_uri.split(',', 1)

        # Decode the Base64 string into binary audio data
        audio_data = base64.b64decode(encoded_data)

        # Use ffmpeg to convert the in-memory audio data to a raw PCM buffer.
        # The pipeline expects a 16 kHz mono audio stream.
        out, _ = (
            ffmpeg
            .input('pipe:0')
            .output('pipe:1', format='s16le', ac=1, ar=16000)
            .run(input=audio_data, capture_stdout=True, capture_stderr=True)
        )

        # Convert the raw PCM buffer to 32-bit floats, normalizing the
        # int16 samples to the [-1.0, 1.0) range the pipeline expects.
        audio_np = np.frombuffer(out, np.int16).astype(np.float32) / 32768.0

        # Transcribe the audio from the NumPy array using the HF pipeline
        transcription_result = pipe(audio_np)
        transcript_text = transcription_result["text"]
    except Exception as e:
        # Capture potential errors from ffmpeg or the model
        return {"error": f"Failed to process audio: {str(e)}"}

    # Perform sentiment analysis on the transcribed text
    return sentiment_analysis(transcript_text)
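
# Direct-call sketch ("sample.wav" is a hypothetical local file):
# with open("sample.wav", "rb") as f:
#     uri = "data:audio/wav;base64," + base64.b64encode(f.read()).decode("ascii")
# print(process_base64_audio(uri))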

# Create the Gradio interface with the Hugging Face theme
demo = gr.Interface(
    fn=process_base64_audio,
    # The input remains a Textbox to accept the raw Base64 string from the API client
    inputs=gr.Textbox(lines=5, placeholder="Paste your Base64 encoded audio data URI here...", label="Base64 Audio Input"),
    outputs=gr.JSON(label="Analysis Result"),
    title="🎙️ Audio Sentiment Analysis (Whisper Small)",
    description="""
Analyze the sentiment of spoken words.
This tool accepts a **Base64 encoded audio data URI**, transcribes the audio in memory using the `openai/whisper-small` model,
and performs sentiment analysis on the text with TextBlob.
""",
    examples=[
        ["data:audio/wav;base64,UklGRiQ...<placeholder_for_a_short_positive_clip>"],
        ["data:audio/wav;base64,UklGRiQ...<placeholder_for_a_short_negative_clip>"],
    ],
    article="""
### How to get a Base64 Audio URI?
You can use an online converter or a script (like the provided `test_client.py`) to convert a short audio file (e.g., .wav or .mp3) into a Base64 data URI.
The format must be `data:audio/[format];base64,[encoded_string]`.
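
For example, a minimal sketch using the Python standard library (`sample.wav` is a placeholder path):

```python
import base64

with open("sample.wav", "rb") as f:
    uri = "data:audio/wav;base64," + base64.b64encode(f.read()).decode("ascii")
```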
| """, | |
    theme='huggingface',  # apply the Hugging Face theme
)

# Launch the interface and MCP server
if __name__ == "__main__":
    # ffmpeg must be installed on your system for the audio conversion to work.
    # Install the required Python packages (the MCP server requires the gradio[mcp] extra);
    # the Whisper weights are downloaded on first run:
    # pip install "gradio[mcp]" textblob "transformers[torch]" accelerate safetensors ffmpeg-python numpy
    demo.launch(mcp_server=True)
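
# Client-side sketch (assumes the gradio_client package; "user/audio-sentiment"
# is a hypothetical Space id, so point it at your own Space or local URL):
# from gradio_client import Client
# client = Client("user/audio-sentiment")
# result = client.predict("data:audio/wav;base64,...", api_name="/predict")
# print(result)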