Spaces:
Sleeping
Sleeping
| # Universal Metrics Configuration for Versa | |
| # This file contains the configuration for various universal metrics used in speech quality assessment. | |
| # visqol metric | |
| # -- visqol: Virtual Speech Quality Objective Listener (ViSQOL) | |
| - name: visqol | |
| model: default | |
| # Language identification with ESPnet-OWSM model | |
| # More model_tag values can be found on the ESPnet Hugging Face page: https://huggingface.co/espnet . | |
| # The default model is `espnet/owsm_v3.1_ebf`. | |
| # -- lid: the n-best language tags | |
| - name: lid | |
| model_tag: default | |
| nbest: 1 | |
| # nomad (reference-based) metric | |
| # -- nomad: nomad reference-based model | |
| - name: nomad | |
| model_cache: versa_cache/nomad_pt-models | |
| # srmr related metrics | |
| # -- srmr: speech-to-reverberation modulation energy ratio | |
| # Boolean options use the canonical lowercase YAML literals (true/false). | |
| - name: srmr | |
| n_cochlear_filters: 23 | |
| low_freq: 125 | |
| min_cf: 4 | |
| max_cf: 128 | |
| fast: true | |
| norm: false | |
| # Emotion similarity calculated based on emo2vec | |
| # --emo2vec_similarity: the emotion similarity with emo2vec | |
| - name: emo2vec_similarity | |
| # noresqa related metrics | |
| # -- noresqa: non-matching reference based speech quality assessment | |
| - name: noresqa | |
| metric_type: 1 #0: NORESQA-score, 1: NORESQA-MOS | |
| # pysepm related metrics | |
| # -- pysepm_fwsegsnr: frequency-weighted segmental SNR | |
| # -- pysepm_llr: Log likelihood ratio | |
| # -- pysepm_wss: weighted spectral slope | |
| # -- pysepm_cd: cepstral distance objective speech quality measure | |
| # -- pysepm_Csig, pysepm_Cbak, pysepm_Covl: composite objective speech quality | |
| # -- pysepm_csii_high, pysepm_csii_mid, pysepm_csii_low: coherence and speech intelligibility index | |
| # -- pysepm_ncm: normalized-covariance measure | |
| - name: pysepm | |
| # nisqa score for speech quality assessment | |
| # -- nisqa_mos_pred: NISQA MOS prediction | |
| # -- nisqa_noi_pred: NISQA noise prediction | |
| # -- nisqa_dis_pred: NISQA distortion prediction | |
| # -- nisqa_col_pred: NISQA coloration prediction | |
| # -- nisqa_loud_pred: NISQA loudness prediction | |
| # NOTE(jiatong): pretrain model can be downloaded with `./tools/setup_nisqa.sh` | |
| - name: nisqa | |
| nisqa_model_path: ./tools/NISQA/weights/nisqa.tar | |
| # discrete speech metrics | |
| # -- speech_bert: speech bert score | |
| # -- speech_bleu: speech bleu score | |
| # -- speech_token_distance: speech token distance score | |
| - name: discrete_speech | |
| # mcd f0 related metrics | |
| # -- mcd: mel cepstral distortion | |
| # -- f0_corr: f0 correlation | |
| # -- f0_rmse: f0 root mean square error | |
| - name: mcd_f0 | |
| f0min: 40 | |
| f0max: 800 | |
| mcep_shift: 5 | |
| mcep_fftl: 1024 | |
| mcep_dim: 39 | |
| mcep_alpha: 0.466 | |
| seq_mismatch_tolerance: 0.1 | |
| power_threshold: -20 | |
| dtw: false | |
| # An overall model on MOS-bench from Sheet toolkit | |
| # --sheet_ssqa: the mos prediction from sheet_ssqa | |
| - name: sheet_ssqa | |
| # pesq related metrics | |
| # -- pesq: perceptual evaluation of speech quality | |
| - name: pesq | |
| # stoi related metrics | |
| # -- stoi: short-time objective intelligibility | |
| - name: stoi | |
| # pseudo subjective metrics | |
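| # -- utmos: UT-MOS score | |
| # -- dnsmos: DNS-MOS score | |
| # -- plcmos: PLC-MOS score | |
| # -- singmos: SingMOS score | |
| # -- utmosv2: UTMOSv2 score | |
| # -- aecmos: AEC-MOS score (not enabled in predictor_types below) | |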
| - name: pseudo_mos | |
| predictor_types: ["utmos", "dnsmos", "plcmos", "singmos", "utmosv2"] | |
| predictor_args: | |
| utmos: | |
| fs: 16000 | |
| dnsmos: | |
| fs: 16000 | |
| plcmos: | |
| fs: 16000 | |
| singmos: | |
| fs: 16000 | |
| utmosv2: | |
| fs: 16000 | |
| # Word error rate with OpenAI-Whisper model | |
| # -- whisper_wer: word error rate of openai-whisper | |
| - name: whisper_wer | |
| model_tag: default | |
| beam_size: 1 | |
| text_cleaner: whisper_basic | |
| # scoreq (reference-based) metric | |
| # -- scoreq_ref: scoreq reference-based model | |
| # data_domain uses the same spelling ("natural") as the scoreq_nr entry. | |
| - name: scoreq_ref | |
| data_domain: natural | |
| model_cache: versa_cache/scoreq_pt-models | |
| # scoreq (non-reference-based) metric | |
| # -- scoreq_nr: scoreq non-reference-based model | |
| - name: scoreq_nr | |
| data_domain: natural | |
| model_cache: versa_cache/scoreq_pt-models | |
| # Speech Enhancement-based Metrics | |
| # model tag can be any ESPnet-SE huggingface repo | |
| # -- se_si_snr: the SI-SNR from a reference speech enhancement model | |
| - name: se_snr | |
| model_tag: default | |
| # PAM: Prompting Audio-Language Models for Audio Quality Assessment | |
| # https://github.com/soham97/PAM/tree/main | |
| # Boolean options use the canonical lowercase YAML literals (true/false). | |
| - name: pam | |
| repro: true | |
| cache_dir: versa_cache/pam | |
| io: soundfile | |
| # TEXT ENCODER CONFIG | |
| text_model: 'gpt2' | |
| text_len: 77 | |
| transformer_embed_dim: 768 | |
| freeze_text_encoder_weights: true | |
| # AUDIO ENCODER CONFIG | |
| audioenc_name: 'HTSAT' | |
| out_emb: 768 | |
| sampling_rate: 44100 | |
| duration: 7 | |
| fmin: 50 | |
| fmax: 8000 #14000 | |
| n_fft: 1024 # 1028 | |
| hop_size: 320 | |
| mel_bins: 64 | |
| window_size: 1024 | |
| # PROJECTION SPACE CONFIG | |
| d_proj: 1024 | |
| temperature: 0.003 | |
| # TRAINING AND EVALUATION CONFIG | |
| num_classes: 527 | |
| batch_size: 1024 | |
| demo: false | |
| # Speaking rate calculation | |
| # -- speaking_rate: speaking word/character rate (presumably derived from the ASR transcript; confirm against the metric implementation) | |
| - name: speaking_rate | |
| model_tag: default | |
| beam_size: 1 | |
| text_cleaner: whisper_basic | |
| # Audiobox Aesthetics (Unified automatic quality assessment for speech, music, and sound.) | |
| - name: audiobox_aesthetics | |
| batch_size: 1 | |
| cache_dir: versa_cache/audiobox | |
| # ASR-match calculation | |
| # -- asr_match_error_rate: correctly matched word/character counts | |
| - name: asr_match | |
| model_tag: default | |
| beam_size: 1 | |
| text_cleaner: whisper_basic | |
| # speaker related metrics | |
| # -- spk_similarity: speaker cosine similarity | |
| - name: speaker | |
| model_tag: default | |
| # asvspoof related metrics | |
| # -- asvspoof_score: evaluates how the generated speech is likely to be classified by a deepfake classifier | |
| - name: asvspoof_score | |
| # signal related metrics | |
| # -- sir: signal to interference ratio | |
| # -- sar: signal to artifact ratio | |
| # -- sdr: signal to distortion ratio | |
| # -- ci-sdr: convolutive transfer function invariant signal to distortion ratio | |
| # -- si-snri: scale-invariant signal to noise ratio improvement | |
| - name: signal_metric | |