Upload 5 files

- src/custom_interface.py +29 -0
- src/d.py +71 -0
- src/packages.txt +1 -0
- src/requirements.txt +9 -0
- src/streamlit_app.py +35 -39
src/custom_interface.py
ADDED
@@ -0,0 +1,29 @@
+# custom_interface.py for CommonAccent English Accent Classifier
+# Downloaded from: https://huggingface.co/Jzuluaga/accent-id-commonaccent_xlsr-en-english/blob/main/custom_interface.py
+# This file is required by the SpeechBrain foreign_class interface.
+
+import torch
+import torchaudio
+from speechbrain.pretrained.interfaces import Pretrained
+
+class CustomEncoderWav2vec2Classifier(Pretrained):
+    MODULES_NEEDED = ["model", "mean_var_norm", "label_encoder"]
+    HPARAMS_NEEDED = ["sample_rate"]
+
+    def classify_file(self, path):
+        # load_audio returns a single tensor, already resampled to the model rate
+        signal = self.load_audio(path)
+        return self.classify_batch(signal, self.hparams.sample_rate)
+
+    def classify_batch(self, signal, fs):
+        if fs != self.hparams.sample_rate:
+            signal = torchaudio.functional.resample(signal, fs, self.hparams.sample_rate)
+        if signal.dim() == 1:
+            signal = signal.unsqueeze(0)  # add a batch dimension
+        lengths = torch.ones(signal.shape[0], device=signal.device)
+        signal = self.modules.mean_var_norm(signal, lengths)
+        embeddings = self.modules.model.encode_batch(signal)
+        out_prob = self.modules.model.classify_batch(embeddings)
+        score, index = torch.max(out_prob, dim=1)
+        text_lab = self.hparams.label_encoder.decode_torch(index)
+        return out_prob, score, index, text_lab
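For reference, a minimal sketch of how this interface is consumed: d.py below loads it through SpeechBrain's foreign_class, which fetches custom_interface.py from the model repo and instantiates the class above. The local path "sample.wav" is a placeholder.

from speechbrain.pretrained.interfaces import foreign_class

classifier = foreign_class(
    source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)
# classify_file returns (out_prob, score, index, text_lab)
out_prob, score, index, text_lab = classifier.classify_file("sample.wav")
print(text_lab)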
src/d.py
ADDED
@@ -0,0 +1,71 @@
+import yt_dlp
+from pydub import AudioSegment
+import os
+import librosa
+import numpy as np
+import matplotlib.pyplot as plt
+import torchaudio
+
+def download_and_extract_audio(video_url, output_audio_path="audio.wav"):
+    ydl_opts = {
+        'format': 'bestaudio/best',
+        'outtmpl': 'temp_audio.%(ext)s',
+        'quiet': True,
+        'postprocessors': [{
+            'key': 'FFmpegExtractAudio',
+            'preferredcodec': 'wav',
+            'preferredquality': '192',
+        }],
+    }
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        ydl.download([video_url])
+    # The wav postprocessor normally writes temp_audio.wav; check other extensions as a fallback
+    for ext in ['wav', 'mp3', 'm4a', 'webm']:
+        fname = f"temp_audio.{ext}"
+        if os.path.exists(fname):
+            if ext != 'wav':
+                audio = AudioSegment.from_file(fname)
+                audio.export(output_audio_path, format="wav")
+                os.remove(fname)
+            else:
+                os.rename(fname, output_audio_path)
+            return output_audio_path
+    raise FileNotFoundError("Audio extraction failed.")
+
+def debug_audio(audio_path):
+    y, sr = librosa.load(audio_path, sr=None)
+    plt.figure(figsize=(10, 2))
+    plt.plot(np.linspace(0, len(y)/sr, num=len(y)), y)
+    plt.title('Extracted Audio Waveform')
+    plt.xlabel('Time (s)')
+    plt.ylabel('Amplitude')
+    plt.show()
+
+def get_accent_classifier():
+    from speechbrain.pretrained.interfaces import foreign_class
+    # Cache the model on the function object so it is only loaded once per process
+    if not hasattr(get_accent_classifier, "model"):
+        get_accent_classifier.model = foreign_class(
+            source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
+            pymodule_file="custom_interface.py",
+            classname="CustomEncoderWav2vec2Classifier"
+        )
+    return get_accent_classifier.model
+
+def analyze_accent(audio_path):
+    accent_classifier = get_accent_classifier()
+    # The classifier expects a path to a wav file
+    out_prob, score, index, text_lab = accent_classifier.classify_file(audio_path)
+    accent = text_lab[0] if isinstance(text_lab, list) else text_lab
+    confidence = float(score[0]) if hasattr(score, '__getitem__') else float(score)
+    summary = f"Detected accent: {accent} with confidence {confidence:.2f}."
+    return accent, confidence, summary
+
+if __name__ == "__main__":
+    video_url = input("Enter public video URL: ")
+    audio_path = download_and_extract_audio(video_url)
+    # debug_audio(audio_path)  # Uncomment to plot the extracted waveform
+    accent, confidence, summary = analyze_accent(audio_path)
+    print(f"Accent: {accent}")
+    print(f"Confidence score: {confidence:.2f}")
+    print(f"Summary: {summary}")
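A quick usage sketch, mirroring what streamlit_app.py does without the interactive prompt; the URL is a placeholder for any public video link:

from d import download_and_extract_audio, analyze_accent

# Hypothetical URL; any public YouTube/Loom/direct MP4 link works
path = download_and_extract_audio("https://www.youtube.com/watch?v=VIDEO_ID")
accent, confidence, summary = analyze_accent(path)
print(summary)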
src/packages.txt
ADDED
@@ -0,0 +1 @@
+ffmpeg
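On Hugging Face Spaces (and Streamlit Community Cloud), packages.txt lists apt-level system packages to install at build time; ffmpeg is needed here by yt-dlp's FFmpegExtractAudio postprocessor and by pydub.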
src/requirements.txt
ADDED
@@ -0,0 +1,9 @@
+streamlit
+speechbrain==0.5.14
+torchaudio
+transformers
+yt-dlp
+pydub
+librosa
+matplotlib
+numpy
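The speechbrain==0.5.14 pin matters: custom_interface.py and d.py import from speechbrain.pretrained.interfaces, a module path the 1.x releases renamed to speechbrain.inference (keeping at most a deprecated alias), so an unpinned install could break those imports.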
src/streamlit_app.py
CHANGED
@@ -1,40 +1,36 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-[… 36 further removed lines of the previous template app …]
+import os
+from d import download_and_extract_audio, analyze_accent, get_accent_classifier
+
+st.title("English Accent Classifier")
+st.write("""
+Paste a public video URL (e.g., YouTube, Loom, or a direct MP4 link). The tool extracts the audio, classifies the speaker's accent, and reports a confidence score.
+""")
+
+# Show a spinner and load the model once at startup
+if "model_loaded" not in st.session_state:
+    with st.spinner("Loading models (this may take a while the first time)..."):
+        get_accent_classifier()
+    st.session_state["model_loaded"] = True
+    st.success("Model loaded!")
+
+video_url = st.text_input("Enter public video URL:")
+
+if st.button("Analyze Accent") and video_url:
+    with st.spinner("Downloading and extracting audio..."):
+        try:
+            audio_path = download_and_extract_audio(video_url)
+        except Exception as e:
+            st.error(f"Audio extraction failed: {e}")
+            st.stop()
+    st.success("Audio extracted successfully!")
+    st.audio(audio_path)
+    with st.spinner("Analyzing accent (downloading model if needed)..."):
+        try:
+            accent, confidence, summary = analyze_accent(audio_path)
+        except Exception as e:
+            st.error(f"Accent analysis failed: {e}")
+            st.stop()
+    st.markdown(f"**Accent:** {accent}")
+    st.markdown(f"**Confidence score:** {confidence:.2f}")
+    st.markdown(f"**Summary:** {summary}")
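A design note on the startup block: the model_loaded flag lives in st.session_state, which is per browser session, so every new session re-enters the loading branch (the model itself is still cached on the function object in d.py). A minimal alternative sketch, assuming Streamlit >= 1.18 where st.cache_resource is available:

import streamlit as st
from d import get_accent_classifier

@st.cache_resource
def load_classifier():
    # Runs once per server process; all sessions and reruns share the object
    return get_accent_classifier()

classifier = load_classifier()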