import gradio as gr
import os
from pathlib import Path

from vogent_turn.inference import TurnDetector
import soundfile as sf
import numpy as np

def get_detector():
    """Lazy load the detector to avoid initialization during import"""
    detector = TurnDetector(compile_model=False, warmup=False)
    return detector


# Initialize the turn detector
detector = get_detector()


# Get all preset names from the samples folder
def get_presets():
    samples_dir = Path("samples")
    if not samples_dir.exists():
        return []
    presets = [d.name for d in samples_dir.iterdir() if d.is_dir()]
    return sorted(presets)

# Load preset data
def load_preset(preset_name):
    """Load audio and text files from the selected preset"""
    if not preset_name:
        return None, "", ""

    preset_dir = Path("samples") / preset_name

    # Load audio
    audio_path = preset_dir / "audio.wav"
    audio_file = str(audio_path) if audio_path.exists() else None

    # Load text files
    prev_text = ""
    curr_text = ""

    prev_path = preset_dir / "prev.txt"
    if prev_path.exists():
        prev_text = prev_path.read_text().strip()

    text_path = preset_dir / "text.txt"
    if text_path.exists():
        curr_text = text_path.read_text().strip()

    return audio_file, prev_text, curr_text
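
# For reference, load_preset above implies a preset directory layout like the
# following (the preset name here is a placeholder, not a shipped sample):
#
#   samples/
#       my_preset/
#           audio.wav   # clip of the current speaker
#           prev.txt    # optional previous line of dialog
#           text.txt    # transcript of audio.wav (no punctuation)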

# Run inference
def run_inference(audio_file, prev_text, curr_text):
    """Run turn detection inference"""
    if audio_file is None:
        return "Error: No audio file provided"
    if not curr_text:
        return "Error: A transcript of the audio must be provided"
    if prev_text is None:
        prev_text = ""
    try:
        # Load audio file
        audio, sr = sf.read(audio_file)

        # Convert to mono if stereo
        if len(audio.shape) > 1:
            audio = audio.mean(axis=1)

        # Ensure audio is float32
        audio = audio.astype(np.float32)

        # Run prediction with context
        result = detector.predict(
            audio,
            prev_line=prev_text if prev_text else None,
            curr_line=curr_text if curr_text else None,
            return_probs=True,
            sample_rate=sr,
        )

        # Format output
        is_endpoint = result['is_endpoint']
        prob_endpoint = result['prob_endpoint']
        prob_continue = result['prob_continue']

        output = f"""
**Turn Detection Result:**

- **Prediction:** {'Turn Complete (Endpoint)' if is_endpoint else 'Turn Incomplete (Continue)'}
- **Probability of Endpoint:** {prob_endpoint:.4f}
- **Probability of Continue:** {prob_continue:.4f}
"""
        return output
    except Exception as e:
        return f"Error during inference: {str(e)}"

# Get default preset and load its data
presets = get_presets()
default_preset = presets[0] if presets else None
default_audio, default_prev_text, default_curr_text = load_preset(default_preset) if default_preset else (None, "", "")

# Create Gradio interface
with gr.Blocks(title="Vogent Turn Demo") as demo:
    gr.Markdown("# Vogent Turn Demo")
    gr.Markdown("Multimodal turn detection using audio and text context")
    gr.Markdown("""
    [GitHub](https://github.com/vogent/vogent-turn) | [Technical Report](https://blog.vogent.ai/posts/voturn-80m-state-of-the-art-turn-detection-for-voice-agents) | [Model Weights](http://huggingface.co/vogent/Vogent-Turn-80M)
    """)

    with gr.Row():
        with gr.Column():
            # Preset selector
            preset_dropdown = gr.Dropdown(
                choices=presets,
                label="Preset Samples",
                info="Select a preset to auto-fill the fields",
                value=default_preset
            )
            # Input fields
            prev_text_input = gr.Textbox(
                label="Previous Line (the previous line spoken in the dialog)",
                placeholder="Enter the previous line of dialog...",
                lines=2,
                value=default_prev_text
            )

            curr_text_input = gr.Textbox(
                label="Current Line (transcript of the audio below; omit punctuation)",
                placeholder="Enter the current line being spoken...",
                lines=2,
                value=default_curr_text
            )
            audio_input = gr.Audio(
                label="Audio",
                type="filepath",
                value=default_audio
            )

            # Inference button
            inference_btn = gr.Button("Run Inference", variant="primary")

        with gr.Column():
            # Output
            output_text = gr.Markdown(label="Results")

    # Connect preset dropdown to load function
    preset_dropdown.change(
        fn=load_preset,
        inputs=[preset_dropdown],
        outputs=[audio_input, prev_text_input, curr_text_input]
    )

    # Connect inference button
    inference_btn.click(
        fn=run_inference,
        inputs=[audio_input, prev_text_input, curr_text_input],
        outputs=[output_text]
    )


if __name__ == "__main__":
    demo.launch()