import gradio as gr
import numpy as np
import os
import time
import torch
from scipy.io import wavfile
import soundfile as sf
import datasets

# Bark imports
from bark import generate_audio, SAMPLE_RATE
from bark.generation import preload_models, generate_text_semantic

# Hugging Face Transformers
from transformers import (
    SpeechT5HifiGan,
    SpeechT5ForTextToSpeech,
    SpeechT5Processor
)
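
# Assumed dependencies (not pinned anywhere in this file): gradio, torch,
# transformers, datasets, soundfile, scipy, and Suno's bark package
# (https://github.com/suno-ai/bark).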


class VoiceSynthesizer:
    def __init__(self):
        # Create working directory
        self.base_dir = os.path.dirname(os.path.abspath(__file__))
        self.working_dir = os.path.join(self.base_dir, "working_files")
        os.makedirs(self.working_dir, exist_ok=True)

        # Store reference voice
        self.reference_voice = None

        # Initialize models dictionary
        self.models = {
            "bark": self._initialize_bark,
            "speecht5": self._initialize_speecht5
        }

        # Default model
        self.current_model = "bark"

        # Initialize Bark models
        try:
            print("Attempting to load Bark models...")
            preload_models()
            print("Bark models loaded successfully.")
        except Exception as e:
            print(f"Bark model loading error: {e}")

    def _initialize_bark(self):
        """Bark model initialization (already done in __init__)"""
        return None

    def _initialize_speecht5(self):
        """Initialize SpeechT5 model from Hugging Face"""
        try:
            # Load SpeechT5 model and processor
            model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

            # Load speaker embeddings
            embeddings_dataset = datasets.load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
            speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)

            return {
                "model": model,
                "processor": processor,
                "vocoder": vocoder,
                "speaker_embeddings": speaker_embeddings
            }
        except Exception as e:
            print(f"SpeechT5 model loading error: {e}")
            return None

    def process_reference_audio(self, reference_audio):
        """Process and store reference audio for voice cloning"""
        try:
            # Gradio can pass audio in different formats
            if reference_audio is None:
                return "No audio provided"

            # Handle different input types
            if isinstance(reference_audio, tuple):
                # Gradio typically returns (sample_rate, audio_array)
                if len(reference_audio) == 2:
                    sample_rate, audio_data = reference_audio
                else:
                    audio_data = reference_audio[0]
                    sample_rate = SAMPLE_RATE  # Default to Bark's sample rate
            elif isinstance(reference_audio, np.ndarray):
                audio_data = reference_audio
                sample_rate = SAMPLE_RATE
            else:
                return "Invalid audio format"

            # Ensure audio is a numpy array
            audio_data = np.asarray(audio_data)

            # Mix multi-channel audio down to mono
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=1)

            # Trim to a maximum length of 10 seconds
            max_duration = 10  # seconds
            max_samples = max_duration * sample_rate
            if len(audio_data) > max_samples:
                audio_data = audio_data[:max_samples]

            # Resample to Bark's sample rate if necessary
            if sample_rate != SAMPLE_RATE:
                from scipy.signal import resample
                audio_data = resample(audio_data, int(len(audio_data) * SAMPLE_RATE / sample_rate))

            # Save the reference audio and remember its path
            ref_filename = os.path.join(self.working_dir, "reference_voice.wav")
            sf.write(ref_filename, audio_data, SAMPLE_RATE)
            self.reference_voice = ref_filename

            return "Reference voice processed successfully"
        except Exception as e:
            print(f"Reference audio processing error: {e}")
            import traceback
            traceback.print_exc()
            return f"Error processing reference audio: {str(e)}"

    def _generate_bark_speech(self, text, voice_preset=None):
        """Generate speech using Bark"""
        # Default Bark voice presets
        voice_presets = [
            "v2/en_speaker_6",  # Female
            "v2/en_speaker_3",  # Male
            "v2/en_speaker_9",  # Neutral
        ]

        # Prepare history prompt. Note: Bark expects a built-in preset name
        # (or an .npz voice prompt), so the saved reference .wav path may be
        # rejected, in which case generation falls back to the default voice.
        history_prompt = None
        if self.reference_voice is not None:
            history_prompt = self.reference_voice

        # If no reference voice is set, use a preset
        if history_prompt is None and voice_preset:
            if isinstance(voice_preset, str):
                # Keep only the preset name, dropping any descriptive suffix
                preset_value = voice_preset.split(' ')[0]
                history_prompt = preset_value if preset_value in voice_presets else voice_presets[0]
            else:
                history_prompt = voice_presets[0]

        try:
            # Generate audio with or without a history prompt
            if history_prompt:
                try:
                    audio_array = generate_audio(
                        text,
                        history_prompt=history_prompt
                    )
                except Exception as preset_error:
                    print(f"Error with specific history prompt: {preset_error}")
                    # Fall back to default generation
                    audio_array = generate_audio(text)
            else:
                # Fall back to default generation
                audio_array = generate_audio(text)

            # Save generated audio
            filename = f"bark_speech_{int(time.time())}.wav"
            filepath = os.path.join(self.working_dir, filename)
            wavfile.write(filepath, SAMPLE_RATE, audio_array)

            return filepath, None
        except Exception as e:
            print(f"Bark speech generation error: {e}")
            import traceback
            traceback.print_exc()
            return None, f"Error in Bark speech generation: {str(e)}"

    def generate_speech(self, text, model_name=None, voice_preset=None):
        """Generate speech using selected model"""
        if not text or not text.strip():
            return None, "Please enter some text to speak"

        # Use specified model or current model
        current_model = model_name or self.current_model

        try:
            if current_model == "bark":
                return self._generate_bark_speech(text, voice_preset)
            elif current_model == "speecht5":
                return self._generate_speecht5_speech(text, voice_preset)
            else:
                raise ValueError(f"Unsupported model: {current_model}")
        except Exception as e:
            print(f"Speech generation error: {e}")
            import traceback
            traceback.print_exc()
            return None, f"Error generating speech: {str(e)}"

    def _generate_speecht5_speech(self, text, speaker_id=None):
        """Generate speech using SpeechT5"""
        # Initialize the model on demand (note: this reloads it on every call)
        speecht5_models = self.models["speecht5"]()
        if not speecht5_models:
            return None, "SpeechT5 model not loaded"

        model = speecht5_models["model"]
        processor = speecht5_models["processor"]
        vocoder = speecht5_models["vocoder"]
        speaker_embeddings = speecht5_models["speaker_embeddings"]

        # Prepare inputs
        inputs = processor(text=text, return_tensors="pt")

        # Generate speech; pass the vocoder so a waveform (not a spectrogram)
        # is returned
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings,
                vocoder=vocoder
            )

        # Convert to a numpy array
        audio_array = speech.cpu().numpy()

        # Save generated audio (SpeechT5 outputs 16 kHz audio)
        filename = f"speecht5_speech_{int(time.time())}.wav"
        filepath = os.path.join(self.working_dir, filename)
        wavfile.write(filepath, 16000, audio_array)

        return filepath, None
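
# A minimal programmatic usage sketch (illustrative only; assumes the model
# weights download successfully and enough memory is available):
#
#     synth = VoiceSynthesizer()
#     path, err = synth.generate_speech(
#         "Hello from the synthesizer",
#         model_name="bark",
#         voice_preset="v2/en_speaker_6",
#     )
#     print(path or err)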


def create_interface():
    synthesizer = VoiceSynthesizer()

    with gr.Blocks() as interface:
        gr.Markdown("# 🎙️ Advanced Voice Synthesis")

        with gr.Row():
            with gr.Column():
                gr.Markdown("## 1. Capture Reference Voice")
                reference_audio = gr.Audio(sources=["microphone", "upload"], type="numpy")
                process_ref_btn = gr.Button("Process Reference Voice")
                process_ref_output = gr.Textbox(label="Reference Voice Processing")

            with gr.Column():
                gr.Markdown("## 2. Generate Speech")
                text_input = gr.Textbox(label="Enter Text to Speak")

                # Model Selection
                model_dropdown = gr.Dropdown(
                    choices=[
                        "bark (Suno AI)",
                        "speecht5 (Microsoft)"
                    ],
                    label="Select TTS Model",
                    value="bark (Suno AI)"
                )

                # Voice Preset Dropdowns
                with gr.Row():
                    bark_preset = gr.Dropdown(
                        choices=[
                            "v2/en_speaker_6 (Female Voice)",
                            "v2/en_speaker_3 (Male Voice)",
                            "v2/en_speaker_9 (Neutral Voice)"
                        ],
                        label="Bark Voice Preset",
                        value="v2/en_speaker_6 (Female Voice)",
                        visible=True
                    )
                    speecht5_preset = gr.Dropdown(
                        choices=[
                            "Default Speaker"
                        ],
                        label="SpeechT5 Speaker",
                        visible=False
                    )

                generate_btn = gr.Button("Generate Speech")
                audio_output = gr.Audio(label="Generated Speech")
                error_output = gr.Textbox(label="Errors", visible=True)

        # Process reference audio
        process_ref_btn.click(
            fn=synthesizer.process_reference_audio,
            inputs=reference_audio,
            outputs=process_ref_output
        )

        # Dynamic model and preset visibility
        def update_model_visibility(model):
            if "bark" in model.lower():
                return {
                    bark_preset: gr.update(visible=True),
                    speecht5_preset: gr.update(visible=False)
                }
            else:
                return {
                    bark_preset: gr.update(visible=False),
                    speecht5_preset: gr.update(visible=True)
                }

        model_dropdown.change(
            fn=update_model_visibility,
            inputs=model_dropdown,
            outputs=[bark_preset, speecht5_preset]
        )

        # Speech generation logic
        def generate_speech_wrapper(text, model, bark_preset, speecht5_preset):
            # Map model name
            model_map = {
                "bark (Suno AI)": "bark",
                "speecht5 (Microsoft)": "speecht5"
            }

            # Select appropriate preset
            preset = bark_preset if "bark" in model else speecht5_preset

            # Extract preset value if it's a string with additional info
            if isinstance(preset, str):
                preset = preset.split(' ')[0]

            return synthesizer.generate_speech(
                text,
                model_name=model_map[model],
                voice_preset=preset
            )

        generate_btn.click(
            fn=generate_speech_wrapper,
            inputs=[text_input, model_dropdown, bark_preset, speecht5_preset],
            outputs=[audio_output, error_output]
        )

    return interface


if __name__ == "__main__":
    interface = create_interface()
    interface.launch(
        share=False,
        debug=True,
        show_error=True,
        server_name='0.0.0.0',
        server_port=7860
    )
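
# Launched locally, the app serves on http://0.0.0.0:7860; Hugging Face Spaces
# also expects Gradio apps on port 7860 by default, so the explicit
# server_port should work unchanged there (assumption based on the default
# Spaces setup).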