Spaces:
Running
on
Zero
Running
on
Zero
| # Configuration for inference-cli.py | |
| # --- Input Files and Text --- | |
| # Local path to, or Hugging Face Hub reference (e.g., "hf://user/repo/model.safetensors") for, the TTS model checkpoint. | |
| # This is the primary required setting. The script infers model type (DiT/UNetT) from this path. | |
| ckpt_path = "hf://Gregniuki/F5-tts_English_German_Polish/multi3/model_900000.pt" # Default used in script | |
| # Path to the reference audio file (WAV, MP3, etc.). Recommended < 10 seconds. | |
| ref_audio = "tests/ref_audio/test_en_1_ref_short.wav" | |
| # Text transcription of the reference audio. | |
| # If set to "", the script will attempt to transcribe ref_audio using Whisper. | |
| ref_text = "Some call me nature, others call me mother nature." | |
| # Text to be synthesized by the TTS model. | |
| gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences." | |
| # Optional: Path to a UTF-8 encoded text file containing the text to synthesize. | |
| # If provided, this overrides the gen_text setting above. | |
| gen_file = "" | |
| # Path to the tokenizer.json file required by the model. | |
| tokenizer_path = "data/Emilia_ZH_EN_pinyin/tokenizer.json" # Default used in script | |
| # --- Output Settings --- | |
| # Directory where the output audio (.wav) and spectrogram (.png) will be saved. | |
| output_dir = "tests" | |
| # Base name for the output files (e.g., "my_speech" -> my_speech.wav, my_speech.png). | |
| output_name = "out" # Default: "out" | |
| # --- Language Settings --- | |
| # Language code for phonemizing the *reference* text (e.g., en-us, en-gb, de, pl, fr-fr). | |
| # Needs to match the language spoken in ref_audio / ref_text. See phonemizer docs for codes. | |
| ref_language = "en-us" # Default: "en-us" | |
| # Language code for phonemizing the *generated* text (gen_text / gen_file). | |
| # Needs to match the language you want the model to speak. | |
| language = "en-us" # Default: "en-us" | |
| # --- Inference Parameters --- | |
| # Speech speed multiplier. > 1.0 is faster, < 1.0 is slower. | |
| speed = 1.0 # Default: 1.0 | |
| # Number of Function Evaluations (sampling steps). Higher values may improve quality but increase time. | |
| nfe = 32 # Default: 32 | |
| # Classifier-Free Guidance strength. Higher values increase adherence to reference timbre but can reduce naturalness. | |
| cfg = 2.0 # Default: 2.0 | |
| # Sway sampling coefficient (experimental). Often -1.0 or disabled. | |
| sway = -1.0 # Default: -1.0 | |
| # --- Postprocessing --- | |
| # Duration (in seconds) for cross-fading between generated audio batches. 0 disables cross-fading. | |
| cross_fade = 0.15 # Default: 0.15 | |
| # Apply silence removal to the final generated audio using pydub. | |
| remove_silence = false # Default: false | |
| # --- System Settings --- | |
| # Optional: Hugging Face API token for downloading private models or for higher rate limits. | |
| # Can also be set via environment variable HUGGING_FACE_HUB_TOKEN. | |
| hf_token = "" # Default: "" (uses cached credentials or public access) | |
| # Optional: Specify the device ('cuda', 'cpu', 'mps'). If commented out or empty, defaults to auto-detection. | |
| # device = "cuda" | |
| # Optional: Specify the data type ('float16', 'bfloat16', 'float32'). If commented out or empty, a default is chosen based on the device. | |
| # dtype = "float16" |