import os

# Whisper is installed from source at app start-up, a common pattern in
# Hugging Face Spaces when the package is not listed in requirements.txt.
os.system("pip install git+https://github.com/openai/whisper.git")

import gradio as gr
import whisper
from huggingface_hub import from_pretrained_keras
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sklearn.preprocessing import StandardScaler
import logging
import librosa
import numpy as np
import pickle
# Tokenizer and NLP model for text sentiment classification
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model_nlp = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

# Whisper model for audio/speech transcription
model = whisper.load_model("small")

# Keras model for audio emotion classification
reloaded_model = from_pretrained_keras('jmparejaz/RAVDESS-CREMAD_AudioEmotionClassifier')

# Fitted feature scaler and label encoder
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)
with open("encoder.pkl", "rb") as f:
    encoder = pickle.load(f)
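# Assumption (inferred from their use in audio_emotions() below): scaler.pkl
# holds a StandardScaler fitted on the 162-dimensional vectors produced by
# extract_features(), and encoder.pkl a fitted one-hot label encoder whose
# inverse_transform maps model outputs back to emotion names.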
def inference_audio(audio):
    # Load the recording and pad/trim it to Whisper's 30-second window
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Compute the log-Mel spectrogram on the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language (the probabilities are not used further)
    _, probs = model.detect_language(mel)

    # Decode without fp16 so the app also runs on CPU
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text
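# Usage sketch ("sample.wav" is a hypothetical path, for illustration only):
#   text = inference_audio("sample.wav")
# whisper.load_audio resamples any input to 16 kHz mono before transcription.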
def inference_text(audio):
    text = inference_audio(audio)
    # Building the pipeline per request keeps the code simple; it could be
    # created once at module level to avoid repeated construction.
    sentiment_task = pipeline("sentiment-analysis", model=model_nlp, tokenizer=tokenizer)
    res = sentiment_task(text)[0]
    return text, res['label'], res['score']
def extract_features(data, sample_rate):
    result = np.array([])

    # Zero-crossing rate
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result = np.hstack((result, zcr))

    # Chroma STFT
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft))

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc))

    # Root mean square energy
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms))

    # Mel spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel))

    return result
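# With librosa's defaults (12 chroma bins, 20 MFCCs, 128 mel bands), the
# returned vector has 1 + 12 + 20 + 1 + 128 = 162 entries per clip, so the
# pickled scaler must have been fitted on vectors of that length.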
| """ | |
| def audio_emotions(audio): | |
| sr,data = audio | |
| features_audio = extract_features(data) | |
| features_audio = np.array(features_audio) | |
| scaled_features=scaler.transform(features_audio) | |
| scaled_features = np.expand_dims(scaled_features, axis=2) | |
| prediction=reloaded_model.predict(scaled_features) | |
| y_pred = encoder.inverse_transform(prediction) | |
| return y_pred | |
| """ | |
def main(audio):
    # Transcription plus text sentiment; audio emotion output stays disabled
    r1, r2, r3 = inference_text(audio)
    # r3 = audio_emotions(audio)
    return r1, r2, r3
audio = gr.Audio(
    label="Input Audio",
    show_label=False,
    source="microphone",
    type="filepath"
)

app = gr.Interface(
    title="Sentiment Audio Analysis",
    fn=main,
    inputs=audio,
    outputs=["text", "text", "text"]
).launch(debug=True)
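# To run outside of Spaces: place scaler.pkl and encoder.pkl next to this file
# and use a Gradio 3.x release (gr.Audio's source= argument was removed in
# Gradio 4), then start the app with `python app.py`.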