Spaces:

Azgadel
/

voice-biometry-demo

Sleeping

App Files Files Community

Azgadel committed 4 days ago

Commit

abe7eaf

verified ·

1 Parent(s): c0f6727

Upload 3 files

Browse files

Files changed (3) hide show

app.py +494 -0
best_embedding_model.pth +3 -0
requirements.txt +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,494 @@

+import os
+os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
+import streamlit as st
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import soundfile as sf
+import torchaudio
+from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
+import numpy as np
+from pathlib import Path
+import json
+import tempfile
+# ============================================================
+# MODEL DEFINITION
+# ============================================================
+class Wav2Vec2ForSpeakerEmbedding(nn.Module):
+    def __init__(self, embedding_size=256):
+        super().__init__()
+        self.wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
+        for param in self.wav2vec2.parameters():
+            param.requires_grad = False
+        self.projection = nn.Sequential(
+            nn.Linear(768, 512),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(512, embedding_size)
+        )
+    def forward(self, input_values):
+        outputs = self.wav2vec2(input_values)
+        hidden_states = outputs.last_hidden_state
+        embeddings = torch.mean(hidden_states, dim=1)
+        embeddings = self.projection(embeddings)
+        embeddings = F.normalize(embeddings, p=2, dim=1)
+        return embeddings
+# ============================================================
+# AUDIO PROCESSING
+# ============================================================
+def process_audio(audio_file, feature_extractor, max_length=16000*3):
+    """Process uploaded audio file"""
+    try:
+        # Save uploaded file temporarily
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
+            tmp_file.write(audio_file.getvalue())
+            tmp_path = tmp_file.name
+        # Load audio
+        waveform, sr = sf.read(tmp_path, dtype='float32')
+        waveform = torch.from_numpy(waveform)
+        # Convert to mono
+        if len(waveform.shape) > 1:
+            waveform = torch.mean(waveform, dim=-1)
+        # Resample to 16kHz
+        if sr != 16000:
+            resampler = torchaudio.transforms.Resample(sr, 16000)
+            waveform = resampler(waveform)
+        # Take middle chunk
+        if len(waveform) > max_length:
+            start = (len(waveform) - max_length) // 2
+            waveform = waveform[start:start + max_length]
+        elif len(waveform) < max_length:
+            padding = max_length - len(waveform)
+            waveform = torch.nn.functional.pad(waveform, (0, padding))
+        # Normalize
+        if waveform.abs().max() > 0:
+            waveform = waveform / waveform.abs().max()
+        # Extract features
+        inputs = feature_extractor(
+            waveform.numpy(),
+            sampling_rate=16000,
+            return_tensors="pt"
+        )
+        # Cleanup
+        os.unlink(tmp_path)
+        return inputs.input_values, waveform.numpy(), sr
+    except Exception as e:
+        st.error(f"Error processing audio: {e}")
+        return None, None, None
+def get_embedding(model, audio_file, feature_extractor, device):
+    """Extract embedding from audio file"""
+    inputs, waveform, sr = process_audio(audio_file, feature_extractor)
+    if inputs is None:
+        return None
+    model.eval()
+    with torch.no_grad():
+        inputs = inputs.to(device)
+        embedding = model(inputs)
+    return embedding.cpu().numpy()
+# ============================================================
+# ENROLLMENT DATABASE
+# ============================================================
+class EnrollmentDB:
+    def __init__(self, db_path='enrollments.json'):
+        self.db_path = db_path
+        self.load_db()
+    def load_db(self):
+        if os.path.exists(self.db_path):
+            with open(self.db_path, 'r') as f:
+                data = json.load(f)
+                self.enrollments = {k: np.array(v) for k, v in data.items()}
+        else:
+            self.enrollments = {}
+    def save_db(self):
+        data = {k: v.tolist() for k, v in self.enrollments.items()}
+        with open(self.db_path, 'w') as f:
+            json.dump(data, f)
+    def enroll(self, name, embedding):
+        self.enrollments[name] = embedding
+        self.save_db()
+    def verify(self, embedding, threshold=0.75):
+        """
+        Verify against all enrolled users
+        Returns: (best_match_name, similarity_score, is_verified)
+        """
+        if not self.enrollments:
+            return None, 0.0, False
+        best_match = None
+        best_score = -1.0
+        embedding_tensor = torch.from_numpy(embedding)
+        for name, enrolled_emb in self.enrollments.items():
+            enrolled_tensor = torch.from_numpy(enrolled_emb)
+            similarity = F.cosine_similarity(embedding_tensor, enrolled_tensor, dim=1).item()
+            if similarity > best_score:
+                best_score = similarity
+                best_match = name
+        is_verified = best_score >= threshold
+        return best_match, best_score, is_verified
+    def get_all_users(self):
+        return list(self.enrollments.keys())
+    def remove_user(self, name):
+        if name in self.enrollments:
+            del self.enrollments[name]
+            self.save_db()
+            return True
+        return False
+# ============================================================
+# STREAMLIT APP
+# ============================================================
+@st.cache_resource
+def load_model():
+    """Load model once and cache it"""
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    model = Wav2Vec2ForSpeakerEmbedding(embedding_size=256).to(device)
+    checkpoint = torch.load('best_embedding_model.pth', map_location=device)
+    model.load_state_dict(checkpoint['model_state_dict'])
+    model.eval()
+    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+    return model, feature_extractor, device
+def main():
+    st.set_page_config(
+        page_title="Voice Biometry Demo",
+        page_icon="🎤",
+        layout="wide"
+    )
+    # Custom CSS
+    st.markdown("""
+        <style>
+        .big-font {
+            font-size:20px !important;
+            font-weight: bold;
+        }
+        .success-box {
+            padding: 20px;
+            border-radius: 10px;
+            background-color: #d4edda;
+            border: 2px solid #28a745;
+            color: #155724;
+        }
+        .failure-box {
+            padding: 20px;
+            border-radius: 10px;
+            background-color: #f8d7da;
+            border: 2px solid #dc3545;
+            color: #721c24;
+        }
+        .info-box {
+            padding: 20px;
+            border-radius: 10px;
+            background-color: #d1ecf1;
+            border: 2px solid #17a2b8;
+            color: #0c5460;
+        }
+        </style>
+    """, unsafe_allow_html=True)
+    # Header
+    st.title("Voice Biometry System - Proof of Concept")
+    st.markdown("### Finetuned Wav2Vec 2.0")
+    # Load model
+    with st.spinner("Loading model..."):
+        model, feature_extractor, device = load_model()
+    # Initialize database
+    db = EnrollmentDB()
+    # Sidebar - Configuration
+    st.sidebar.header("⚙️ Configuration")
+    threshold = st.sidebar.slider(
+        "Verification Threshold",
+        min_value=0.5,
+        max_value=0.95,
+        value=0.75,
+        step=0.05,
+        help="Higher = more strict verification"
+    )
+    st.sidebar.markdown("---")
+    st.sidebar.header("📊 System Stats")
+    st.sidebar.metric("Enrolled Users", len(db.get_all_users()))
+    st.sidebar.metric("Model Accuracy", "76%")
+    st.sidebar.metric("AUC Score", "0.82")
+    # Enrolled users list
+    if db.get_all_users():
+        st.sidebar.markdown("---")
+        st.sidebar.header("👥 Enrolled Users")
+        for user in db.get_all_users():
+            col1, col2 = st.sidebar.columns([3, 1])
+            col1.write(f"• {user}")
+            if col2.button("🗑️", key=f"del_{user}"):
+                db.remove_user(user)
+                st.rerun()
+    # Main tabs
+    tab1, tab2, tab3 = st.tabs(["📝 Enrollment", "✅ Verification", "ℹ️ About"])
+    # ============================================================
+    # TAB 1: ENROLLMENT
+    # ============================================================
+    with tab1:
+        st.header("Enroll a New User")
+        st.markdown("Upload a voice recording to register a new user in the system.")
+        col1, col2 = st.columns([2, 1])
+        with col1:
+            enroll_name = st.text_input(
+                "User Name",
+                placeholder="Enter name (e.g., Abdou Diop)",
+                help="This name will be used to identify the speaker"
+            )
+            enroll_audio = st.file_uploader(
+                "Upload Voice Recording",
+                type=['wav', 'mp3', 'flac', 'ogg'],
+                help="Upload a clear voice recording (3-20 seconds recommended)",
+                key="enroll"
+            )
+        with col2:
+            st.info("""
+            **Enrollment Tips:**
+            - Use clear audio
+            - 3-20 seconds long
+            - Minimal background noise
+            - Normal speaking voice
+            """)
+        if st.button("🎯 Enroll User", type="primary", disabled=(not enroll_name or not enroll_audio)):
+            with st.spinner(f"Processing enrollment for {enroll_name}..."):
+                # Check if user already exists
+                if enroll_name in db.get_all_users():
+                    st.warning(f"⚠️ User '{enroll_name}' already exists. Please use a different name or remove the existing user first.")
+                else:
+                    # Get embedding
+                    embedding = get_embedding(model, enroll_audio, feature_extractor, device)
+                    if embedding is not None:
+                        # Save enrollment
+                        db.enroll(enroll_name, embedding)
+                        st.markdown(f"""
+                        <div class="success-box">
+                            <h3>✅ Enrollment Successful!</h3>
+                            <p><strong>{enroll_name}</strong> has been enrolled in the system.</p>
+                            <p>Total enrolled users: {len(db.get_all_users())}</p>
+                        </div>
+                        """, unsafe_allow_html=True)
+                        #st.balloons()
+                    else:
+                        st.error("❌ Failed to process audio. Please try again with a different recording.")
+    # ============================================================
+    # TAB 2: VERIFICATION
+    # ============================================================
+    with tab2:
+        st.header("Verify User Identity")
+        st.markdown("Upload a voice recording to verify against enrolled users.")
+        if not db.get_all_users():
+            st.warning("⚠️ No users enrolled yet. Please enroll at least one user first.")
+        else:
+            col1, col2 = st.columns([2, 1])
+            with col1:
+                verify_audio = st.file_uploader(
+                    "Upload Voice Recording for Verification",
+                    type=['wav', 'mp3', 'flac', 'ogg'],
+                    help="Upload a voice recording from a speaker you want to verify",
+                    key="verify"
+                )
+            with col2:
+                st.info(f"""
+                **Verification Info:**
+                - {len(db.get_all_users())} users enrolled
+                - Threshold: {threshold:.2f}
+                - Model: Wav2Vec 2.0
+                """)
+            if st.button("🔍 Verify Identity", type="primary", disabled=(not verify_audio)):
+                with st.spinner("Analyzing voice..."):
+                    # Get embedding
+                    embedding = get_embedding(model, verify_audio, feature_extractor, device)
+                    if embedding is not None:
+                        # Verify
+                        match_name, similarity, is_verified = db.verify(embedding, threshold)
+                        # Display results
+                        st.markdown("---")
+                        if is_verified:
+                            st.markdown(f"""
+                            <div class="success-box">
+                                <h2>✅ VERIFICATION SUCCESSFUL</h2>
+                                <h3>Identified as: {match_name}</h3>
+                                <p style="font-size: 18px;">Confidence Score: <strong>{similarity:.1%}</strong></p>
+                            </div>
+                            """, unsafe_allow_html=True)
+                            st.success(f"🎉 Welcome back, {match_name}!")
+                        else:
+                            st.markdown(f"""
+                            <div class="failure-box">
+                                <h2>❌ VERIFICATION FAILED</h2>
+                                <p>Closest match: <strong>{match_name}</strong></p>
+                                <p>Similarity: <strong>{similarity:.1%}</strong></p>
+                                <p>Threshold required: <strong>{threshold:.1%}</strong></p>
+                                <p><em>This speaker is not recognized in the system.</em></p>
+                            </div>
+                            """, unsafe_allow_html=True)
+                        # Show all scores
+                        with st.expander("📊 See detailed scores for all enrolled users"):
+                            st.markdown("### Similarity Scores")
+                            scores = []
+                            embedding_tensor = torch.from_numpy(embedding)
+                            for name, enrolled_emb in db.enrollments.items():
+                                enrolled_tensor = torch.from_numpy(enrolled_emb)
+                                sim = F.cosine_similarity(embedding_tensor, enrolled_tensor, dim=1).item()
+                                scores.append({
+                                    'User': name,
+                                    'Similarity': f"{sim:.1%}",
+                                    'Status': '✅ Match' if sim >= threshold else '❌ No match'
+                                })
+                            # Sort by similarity
+                            scores.sort(key=lambda x: x['Similarity'], reverse=True)
+                            import pandas as pd
+                            df = pd.DataFrame(scores)
+                            st.dataframe(df, use_container_width=True, hide_index=True)
+                    else:
+                        st.error("❌ Failed to process audio. Please try again with a different recording.")
+    # ============================================================
+    # TAB 3: ABOUT
+    # ============================================================
+    with tab3:
+        st.header("About This System")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.markdown("""
+            ### 🎯 Technology
+            **Model Architecture:**
+            - Base: Wav2Vec 2.0 (Facebook AI)
+            - Finetuned on 247 speakers
+            - 1035 voice samples (telephone quality, 8kHz)
+            - Embedding dimension: 256
+            **Training Details:**
+            - Loss: Supervised Contrastive Learning
+            - Framework: PyTorch + Transformers
+            - Training time: ~50 epochs
+            - Hardware: NVIDIA RTX 3050
+            """)
+        with col2:
+            st.markdown("""
+            ### 📊 Performance Metrics
+            **Evaluation Results:**
+            - **Accuracy:** 76%
+            - **AUC Score:** 0.82
+            - **True Positive Rate:** 79%
+            - **False Positive Rate:** 27%
+            **Test Set:**
+            - 1000 verification pairs
+            - 500 same-speaker pairs
+            - 500 different-speaker pairs
+            """)
+        st.markdown("---")
+        st.markdown("""
+        ### 🔧 How It Works
+        1. **Enrollment Phase:**
+           - User uploads voice recording
+           - System extracts 256-dimensional embedding
+           - Embedding stored in database with user name
+        2. **Verification Phase:**
+           - Unknown voice recording uploaded
+           - System extracts embedding
+           - Computes cosine similarity with all enrolled users
+           - Returns match if similarity exceeds threshold
+        3. **Matching Algorithm:**
+           - Cosine similarity between embeddings
+           - Range: -1 (opposite) to +1 (identical)
+           - Typical same-speaker: 0.75-0.95
+           - Typical different-speaker: 0.30-0.70
+        """)
+        st.markdown("---")
+        st.info("""
+        **Note:** This is a proof of concept system. For production deployment, consider:
+        - Larger training dataset (10-20 samples per speaker)
+        - Better base model (WavLM for noisy conditions)
+        - Anti-spoofing measures
+        - Liveness detection
+        - Multi-enrollment (average multiple recordings per user)
+        """)
+# Script entry point: build the UI only when this file is executed directly.
+if __name__ == "__main__":
+    main()

best_embedding_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3312a4527b3bea45dc377a9a8dacf0f8421e8a8597947338b49140c0bc2e35e4
+size 379678794

requirements.txt ADDED Viewed

Binary file (176 Bytes). View file