import gradio as gr
import os
from pathlib import Path

from vogent_turn.inference import TurnDetector
import soundfile as sf
import numpy as np

def get_detector():
    """Lazy load the detector to avoid initialization during import"""
    detector = TurnDetector(compile_model=False, warmup=False)
    return detector


# Initialize the turn detector
detector = get_detector()


# Get all preset names from the samples folder
def get_presets():
    samples_dir = Path("samples")
    if not samples_dir.exists():
        return []
    presets = [d.name for d in samples_dir.iterdir() if d.is_dir()]
    return sorted(presets)

# Load preset data
def load_preset(preset_name):
    """Load audio and text files from the selected preset"""
    if not preset_name:
        return None, "", ""

    preset_dir = Path("samples") / preset_name

    # Load audio
    audio_path = preset_dir / "audio.wav"
    audio_file = str(audio_path) if audio_path.exists() else None

    # Load text files
    prev_text = ""
    curr_text = ""

    prev_path = preset_dir / "prev.txt"
    if prev_path.exists():
        prev_text = prev_path.read_text().strip()

    text_path = preset_dir / "text.txt"
    if text_path.exists():
        curr_text = text_path.read_text().strip()

    return audio_file, prev_text, curr_text
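
# For reference, load_preset above implies a preset directory layout like the
# following (the preset name here is a placeholder, not a shipped sample):
#
#   samples/
#       my_preset/
#           audio.wav   # clip of the current speaker
#           prev.txt    # optional previous line of dialog
#           text.txt    # transcript of audio.wav (no punctuation)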

# Run inference
def run_inference(audio_file, prev_text, curr_text):
    """Run turn detection inference"""
    if audio_file is None:
        return "Error: No audio file provided"
    if not curr_text:
        return "Error: A transcript of the audio must be provided"
    if prev_text is None:
        prev_text = ""
    try:
        # Load audio file
        audio, sr = sf.read(audio_file)

        # Convert to mono if stereo
        if len(audio.shape) > 1:
            audio = audio.mean(axis=1)

        # Ensure audio is float32
        audio = audio.astype(np.float32)

        # Run prediction with context
        result = detector.predict(
            audio,
            prev_line=prev_text if prev_text else None,
            curr_line=curr_text if curr_text else None,
            return_probs=True,
            sample_rate=sr,
        )

        # Format output
        is_endpoint = result['is_endpoint']
        prob_endpoint = result['prob_endpoint']
        prob_continue = result['prob_continue']

        output = f"""
**Turn Detection Result:**

- **Prediction:** {'Turn Complete (Endpoint)' if is_endpoint else 'Turn Incomplete (Continue)'}
- **Probability of Endpoint:** {prob_endpoint:.4f}
- **Probability of Continue:** {prob_continue:.4f}
"""
        return output
    except Exception as e:
        return f"Error during inference: {str(e)}"

# Get default preset and load its data
presets = get_presets()
default_preset = presets[0] if presets else None
default_audio, default_prev_text, default_curr_text = load_preset(default_preset) if default_preset else (None, "", "")

# Create Gradio interface
with gr.Blocks(title="Vogent Turn Demo") as demo:
    gr.Markdown("# Vogent Turn Demo")
    gr.Markdown("Multimodal turn detection using audio and text context")
    gr.Markdown("""
    [GitHub](https://github.com/vogent/vogent-turn) | [Technical Report](https://blog.vogent.ai/posts/voturn-80m-state-of-the-art-turn-detection-for-voice-agents) | [Model Weights](http://huggingface.co/vogent/Vogent-Turn-80M)
    """)

    with gr.Row():
        with gr.Column():
            # Preset selector
            preset_dropdown = gr.Dropdown(
                choices=presets,
                label="Preset Samples",
                info="Select a preset to auto-fill the fields",
                value=default_preset
            )
            # Input fields
            prev_text_input = gr.Textbox(
                label="Previous Line (the previous line spoken in the dialog)",
                placeholder="Enter the previous line of dialog...",
                lines=2,
                value=default_prev_text
            )

            curr_text_input = gr.Textbox(
                label="Current Line (transcript of the audio below; omit punctuation)",
                placeholder="Enter the current line being spoken...",
                lines=2,
                value=default_curr_text
            )
            audio_input = gr.Audio(
                label="Audio",
                type="filepath",
                value=default_audio
            )

            # Inference button
            inference_btn = gr.Button("Run Inference", variant="primary")

        with gr.Column():
            # Output
            output_text = gr.Markdown(label="Results")

    # Connect preset dropdown to load function
    preset_dropdown.change(
        fn=load_preset,
        inputs=[preset_dropdown],
        outputs=[audio_input, prev_text_input, curr_text_input]
    )

    # Connect inference button
    inference_btn.click(
        fn=run_inference,
        inputs=[audio_input, prev_text_input, curr_text_input],
        outputs=[output_text]
    )


if __name__ == "__main__":
    demo.launch()