Spaces:
Running
Running
| { | |
| "base_config": "config/comosvc.json", | |
| "model_type": "DiffComoSVC", | |
| "dataset": [ | |
| "m4singer", | |
| "opencpop", | |
| "opensinger", | |
| "svcc", | |
| "vctk" | |
| ], | |
| "dataset_path": { | |
| // TODO: Fill in your dataset path | |
| "m4singer": "[M4Singer dataset path]", | |
| "opencpop": "[Opencpop dataset path]", | |
| "opensinger": "[OpenSinger dataset path]", | |
| "svcc": "[SVCC dataset path]", | |
| "vctk": "[VCTK dataset path]" | |
| }, | |
| // TODO: Fill in the output log path | |
| "log_dir": "[Your path to save logs and checkpoints]", | |
| "preprocess": { | |
| // TODO: Fill in the output data path | |
| "processed_dir": "[Your path to save processed data]", | |
| // Config for feature extraction | |
| "extract_mel": true, | |
| "extract_pitch": true, | |
| "extract_energy": true, | |
| "extract_whisper_feature": true, | |
| "extract_contentvec_feature": true, | |
| "extract_wenet_feature": false, | |
| "whisper_batch_size": 30, // decrease it if your GPU is out of memory | |
| "contentvec_batch_size": 1, | |
| // Fill in the paths to the content-based pretrained models | |
| "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", | |
| "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", | |
| "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", | |
| "whisper_model": "medium", | |
| "whisper_model_path": "pretrained/whisper/medium.pt", | |
| // Config for feature usage | |
| "use_mel": true, | |
| "use_min_max_norm_mel": true, | |
| "use_frame_pitch": true, | |
| "use_frame_energy": true, | |
| "use_spkid": true, | |
| "use_whisper": true, | |
| "use_contentvec": true, | |
| "use_wenet": false, | |
| "n_mel": 100, | |
| "sample_rate": 24000 | |
| }, | |
| "model": { | |
| "teacher_model_path":"[Your_teacher_model_checkpoint].bin", | |
| "condition_encoder": { | |
| // Config for feature usage | |
| "use_whisper": true, | |
| "use_contentvec": true, | |
| "use_wenet": false, | |
| "whisper_dim": 1024, | |
| "contentvec_dim": 256, | |
| "wenet_dim": 512, | |
| "use_singer_encoder": false, | |
| "pitch_min": 50, | |
| "pitch_max": 1100 | |
| }, | |
| "comosvc":{ | |
| "distill": false, | |
| // conformer encoder | |
| "input_dim": 384, | |
| "output_dim": 100, | |
| "n_heads": 2, | |
| "n_layers": 6, | |
| "filter_channels":512, | |
| "dropout":0.1, | |
| // Karras diffusion | |
| "P_mean": -1.2, | |
| "P_std": 1.2, | |
| "sigma_data": 0.5, | |
| "sigma_min": 0.002, | |
| "sigma_max": 80, | |
| "rho": 7, | |
| "n_timesteps": 40 | |
| }, | |
| "diffusion": { | |
| // Diffusion step encoder | |
| "step_encoder": { | |
| "dim_raw_embedding": 128, | |
| "dim_hidden_layer": 512, | |
| "activation": "SiLU", | |
| "num_layer": 2, | |
| "max_period": 10000 | |
| }, | |
| // Diffusion decoder | |
| "model_type": "bidilconv", | |
| // Options for "model_type": bidilconv, unet2d (TODO: unet1d) | |
| "bidilconv": { | |
| "base_channel": 384, | |
| "n_res_block": 20, | |
| "conv_kernel_size": 3, | |
| "dilation_cycle_length": 4, | |
| // Special case: a "dilation_cycle_length" of 1 means no dilation | |
| "conditioner_size": 100 | |
| } | |
| } | |
| }, | |
| "train": { | |
| "batch_size": 64, | |
| "gradient_accumulation_step": 1, | |
| "max_epoch": -1, // -1 means no limit | |
| "save_checkpoint_stride": [ | |
| 50, | |
| 50 | |
| ], | |
| "keep_last": [ | |
| 5, | |
| -1 | |
| ], | |
| "run_eval": [ | |
| false, | |
| true | |
| ], | |
| "adamw": { | |
| "lr": 4.0e-4 | |
| }, | |
| "reducelronplateau": { | |
| "factor": 0.8, | |
| "patience": 10, | |
| "min_lr": 1.0e-4 | |
| }, | |
| "dataloader": { | |
| "num_worker": 8, | |
| "pin_memory": true | |
| }, | |
| "sampler": { | |
| "holistic_shuffle": false, | |
| "drop_last": true | |
| } | |
| }, | |
| "inference": { | |
| "comosvc": { | |
| "inference_steps": 40 | |
| } | |
| } | |
| } |