Spaces:
Running
Running
| # app.py (for your new MeloTTS space) | |
| import gradio as gr | |
| import torch | |
| import io | |
| import os | |
| import numpy as np | |
| import soundfile as sf | |
| import base64 | |
| import logging | |
# Download the UniDic dictionary data. This has to happen before the
# `melo` import that follows (per the original author's note, it must run at
# start-up).
#
# The download is launched with the interpreter that is running this script
# (sys.executable) and an argument list rather than a shell string, so it does
# not depend on which `python` happens to be first on PATH.
import subprocess
import sys

_unidic_download = subprocess.run(
    [sys.executable or "python", "-m", "unidic", "download"],
    check=False,  # best effort: a failed download must not stop start-up
)
if _unidic_download.returncode != 0:
    # Logging is not configured yet at this point, so report on stderr.
    print(
        "WARNING: 'unidic download' exited with status "
        f"{_unidic_download.returncode}",
        file=sys.stderr,
    )
| from melo.api import TTS | |
# --- Logging setup ---
# Records are formatted as "<timestamp> - <level> - <message>" at INFO level;
# this module logs through its own named logger.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- Static configuration ---
# Everything is fixed up front: one language, one voice, one speed.
LANGUAGE = "KR"
SPEAKER_ID = "KR"  # default Korean speaker; used as the key into spk2id
# NOTE: very small values such as 0.1 make speech extremely slow. 0.8 is a
# reasonable starting point; adjust if needed.
SPEED = 0.8
# Use the GPU when torch reports CUDA as available, otherwise the CPU.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
# --- One-time model load (runs once, when the space starts) ---
# MODEL_INSTANCE is left as None if construction fails: the error is logged
# with its traceback and the process keeps running, so synthesize() can
# report the missing model on each request.
MODEL_INSTANCE = None
try:
    logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}...")
    MODEL_INSTANCE = TTS(language=LANGUAGE, device=DEVICE)
except Exception as e:
    # MODEL_INSTANCE is still None here; nothing to reset.
    logger.exception(f"FATAL: MeloTTS model initialization error: {e}")
else:
    logger.info("MeloTTS model loaded successfully.")
def _wav_buffer_to_data_uri(wav_buffer):
    """Return the bytes held in *wav_buffer* as a ``data:audio/wav`` URI."""
    # getvalue() returns the full contents regardless of the current stream
    # position, so no seek(0) is needed after the buffer has been written.
    encoded = base64.b64encode(wav_buffer.getvalue()).decode("ascii")
    return f"data:audio/wav;base64,{encoded}"


def synthesize(text_to_synthesize):
    """Synthesize speech for the given text and return it as a WAV data URI.

    Args:
        text_to_synthesize: Text to speak. ``None``, an empty string, or a
            whitespace-only string produces a short silent clip instead of
            calling the model.

    Returns:
        A ``data:audio/wav;base64,...`` string holding the WAV audio.

    Raises:
        gr.Error: If the model was not loaded at start-up, or if synthesis
            fails. In the latter case the original exception is chained as
            the cause.
    """
    # Compare against None explicitly: the question is "did loading fail?",
    # not "is the model object truthy?".
    if MODEL_INSTANCE is None:
        raise gr.Error("TTS Model is not available. Cannot process request.")

    if not text_to_synthesize or not text_to_synthesize.strip():
        # Nothing to say: return 0.1 s of 16-bit silence as a valid WAV.
        # NOTE(review): the 24000 Hz rate is hard-coded and may differ from
        # the model's own output rate — confirm if clients care.
        silence_rate = 24000
        silent_audio = np.zeros(int(0.1 * silence_rate), dtype=np.int16)
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, silent_audio, silence_rate, format='WAV')
        return _wav_buffer_to_data_uri(wav_buffer)

    try:
        logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")
        # Hand the model an in-memory buffer as its output target so no file
        # is written to disk.
        wav_buffer = io.BytesIO()
        MODEL_INSTANCE.tts_to_file(
            text_to_synthesize,
            MODEL_INSTANCE.hps.data.spk2id[SPEAKER_ID],
            wav_buffer,
            speed=SPEED,
            format='wav',
        )
        data_uri = _wav_buffer_to_data_uri(wav_buffer)
        logger.info("Synthesis complete.")
        # The data URI string is the format the React client expects.
        return data_uri
    except Exception as e:
        logger.exception(f"TTS synthesis error: {e}")
        # Chain the cause so the original traceback is preserved.
        raise gr.Error(f"An error occurred during synthesis: {str(e)}") from e
# --- Build and launch the Gradio interface ---
# A bare API-style interface: one text box in, one plain string out.
_text_input = gr.Textbox(label="Text to Synthesize")
iface = gr.Interface(
    fn=synthesize,
    inputs=_text_input,
    # The output is a plain text string: the base64 data URI from synthesize.
    outputs="text",
    api_name="synthesize",
    title="MeloTTS API",
    description="A simplified API for MeloTTS. Pre-configured for Korean at 0.8 speed.",
)
# Enable request queuing to manage traffic, then start serving.
_queued_iface = iface.queue()
_queued_iface.launch()