import os
import subprocess
import sys


def install(package):
    """Force-reinstall a pip package, optionally pinned as 'name==version'."""
    if '==' in package:
        package_name, package_version = package.split('==')
    else:
        package_name = package
        package_version = None
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "-y", package_name])
        print(f"Successfully uninstalled {package_name}")
    except subprocess.CalledProcessError:
        print(f"Package {package_name} was not installed, proceeding with installation")
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


# install('pydantic==2.0.0')
# install('gradio==4.44.0')
# install('spacy==3.7')
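# install() force-reinstalls a dependency at startup: it removes any existing copy of the
# package (ignoring the failure when none is installed) and then pip-installs the requested
# spec. The commented-out calls above show the intended usage with pinned versions.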
# Runtime flags: setting PROD_MODE=local switches on the local (non-prod) paths below.
debug = False
is_prod = True
if os.environ.get('PROD_MODE') == 'local':
    is_prod = False
import pickle

import gradio as gr

if not is_prod:
    # Running locally: redirect all Hugging Face caches to a shared project directory
    # and make a local ffmpeg build visible on PATH.
    os.environ['HF_HOME'] = '/proj/afosr/metavoice/cache'
    os.environ['TRANSFORMERS_CACHE'] = '/proj/afosr/metavoice/cache'
    os.environ['HF_DATASETS_CACHE'] = '/proj/afosr/metavoice/cache'
    os.environ['HF_METRICS_CACHE'] = '/proj/afosr/metavoice/cache'
    os.environ['HF_MODULES_CACHE'] = '/proj/afosr/metavoice/cache'

    ffmpeg_path = '/home/hc3295/ffmpegg_build/bin'
    os.environ['PATH'] += os.pathsep + ffmpeg_path
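    # Optional sanity check (not part of the original script): confirm ffmpeg now resolves.
    #   import shutil
    #   assert shutil.which('ffmpeg') is not None, 'ffmpeg not found on PATH'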
import torch

if not debug:
    import shutil
    import tempfile
    import time
    from pathlib import Path

    import librosa
    from huggingface_hub import snapshot_download

    from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
    from fam.llm.decoders import EncodecDecoder
    from fam.llm.fast_inference_utils import build_model, main
    from fam.llm.inference import (
        EncodecDecoder,
        InferenceConfig,
        Model,
        TiltedEncodec,
        TrainedBPETokeniser,
        get_cached_embedding,
        get_cached_file,
        get_enhancer,
    )
    from fam.llm.utils import (
        check_audio_file,
        get_default_dtype,
        get_device,
        normalize_text,
    )
DESCRIPTION = ""
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
if torch.cuda.is_available():
    if not debug:
        model_name = "metavoiceio/metavoice-1B-v0.1"
        seed = 1337
        output_dir = "outputs"
        _dtype = get_default_dtype()
        _device = 'cuda:0'
        _model_dir = snapshot_download(repo_id=model_name)
        first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
        os.makedirs(output_dir, exist_ok=True)

        second_stage_ckpt_path = f"{_model_dir}/second_stage.pt"
        config_second_stage = InferenceConfig(
            ckpt_path=second_stage_ckpt_path,
            num_samples=1,
            seed=seed,
            device=_device,
            dtype=_dtype,
            compile=False,
            init_from="resume",
            output_dir=output_dir,
        )
        data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
        llm_second_stage = Model(
            config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
        )
        enhancer = get_enhancer("df")

        precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[_dtype]
        model, tokenizer, smodel, model_size = build_model(
            precision=precision,
            checkpoint_path=Path(f"{_model_dir}/first_stage.pt"),
            spk_emb_ckpt_path=Path(f"{_model_dir}/speaker_encoder.pt"),
            device=_device,
            compile=True,
            compile_prefill=True,
        )
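
# Pipeline summary: build_model loads the first-stage LLM (text + speaker embedding ->
# coarse audio tokens) together with its tokenizer and the speaker-embedding model
# (smodel); llm_second_stage turns those tokens into waveforms (second stage +
# multi-band diffusion, per the comments in generate_sample below); get_enhancer("df")
# returns a speech enhancer (presumably DeepFilterNet) used to post-process the audio.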
def generate_sample(text, emo_dir=None, source_path=None, emo_path=None, neutral_path=None, strength=0.1, top_p=0.95, guidance_scale=3.0, preset_dropdown=None, toggle=None):
    print('text', text)
    print('emo_dir', emo_dir)
    print('source_path', source_path)
    print('emo_path', emo_path)
    print('neutral_path', neutral_path)
    print('strength', strength)
    print('top_p', top_p)
    print('guidance_scale', guidance_scale)

    if toggle == RADIO_CHOICES[0]:
        source_path = PRESET_VOICES[preset_dropdown]
    source_path = get_cached_file(source_path)
    check_audio_file(source_path)
    source_emb = get_cached_embedding(source_path, smodel).to(device=_device, dtype=precision)
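
    # Emotion direction: either extracted on the fly from an (emotional, neutral) pair of
    # uploads from the same speaker, or looked up from the precomputed directions in
    # ALL_EMO_DIRS. The unit-normalised direction is scaled by `strength` and added to the
    # source speaker embedding to produce the emotion-edited embedding used for synthesis.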
    if emo_dir == EMO_NAMES[0]:
        emo_path = get_cached_file(emo_path)
        check_audio_file(emo_path)
        emo_emb = get_cached_embedding(emo_path, smodel).to(device=_device, dtype=precision)

        neutral_path = get_cached_file(neutral_path)
        check_audio_file(neutral_path)
        neutral_emb = get_cached_embedding(neutral_path, smodel).to(device=_device, dtype=precision)

        emo_dir = emo_emb - neutral_emb
        emo_dir = emo_dir / torch.norm(emo_dir, p=2)
    else:
        emo_dir = torch.tensor(ALL_EMO_DIRS[emo_dir], device=_device, dtype=precision)

    edited_emb = source_emb + strength * emo_dir
    edited_emb = edited_emb.to(device=_device, dtype=precision)

    temperature = 1.0
    text = normalize_text(text)

    start = time.time()
    # first stage LLM
    tokens = main(
        model=model,
        tokenizer=tokenizer,
        model_size=model_size,
        prompt=text,
        spk_emb=edited_emb,
        top_p=torch.tensor(top_p, device=_device, dtype=precision),
        guidance_scale=torch.tensor(guidance_scale, device=_device, dtype=precision),
        temperature=torch.tensor(temperature, device=_device, dtype=precision),
    )
    text_ids, extracted_audio_ids = first_stage_adapter.decode([tokens])
    b_speaker_embs = edited_emb.unsqueeze(0)

    # second stage LLM + multi-band diffusion model
    wav_files = llm_second_stage(
        texts=[text],
        encodec_tokens=[torch.tensor(extracted_audio_ids, dtype=torch.int32, device=_device).unsqueeze(0)],
        speaker_embs=b_speaker_embs,
        batch_size=1,
        guidance_scale=None,
        top_p=None,
        top_k=200,
        temperature=1.0,
        max_new_tokens=None,
    )
    wav_file = wav_files[0]

    with tempfile.NamedTemporaryFile(suffix=".wav") as enhanced_tmp:
        enhancer(str(wav_file) + ".wav", enhanced_tmp.name)
        shutil.copy2(enhanced_tmp.name, str(wav_file) + ".wav")
        print(f"\nSaved audio to {wav_file}.wav")

    output_path = str(wav_file) + ".wav"
    return output_path
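
# Illustrative direct call (hypothetical values), bypassing the Gradio UI; the preset
# voice "Bria" and the first precomputed emotion in EMO_NAMES are just examples:
#   wav_path = generate_sample(
#       "To be or not to be, that is the question.",
#       emo_dir=EMO_NAMES[1],
#       strength=0.3,
#       preset_dropdown="Bria",
#       toggle=RADIO_CHOICES[0],
#   )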
with open('all_emo_dirs.pkl', 'rb') as f:
    ALL_EMO_DIRS = pickle.load(f)

EMO_NAMES = ['Upload your own sample'] + list(ALL_EMO_DIRS.keys())
RADIO_CHOICES = ["Preset voices", "Upload your voice"]
MAX_CHARS = 220
PRESET_VOICES = {
    # female
    "Bria": "https://cdn.themetavoice.xyz/speakers%2Fbria.mp3",
    # male
    "Alex": "https://cdn.themetavoice.xyz/speakers/alex.mp3",
    "Jacob": "https://cdn.themetavoice.xyz/speakers/jacob.wav",
}
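
# ALL_EMO_DIRS presumably maps emotion names to precomputed emotion-direction vectors
# (the same kind of direction generate_sample derives from an emotional/neutral pair).
# EMO_NAMES[0] is the sentinel "upload your own" option; PRESET_VOICES maps display names
# to reference audio URLs used for cloning.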
def denormalise_top_p(top_p):
    # maps the 0-10 slider value to top_p in the range [0.9, 1.0]
    return round(0.9 + top_p / 100, 2)


def denormalise_guidance(guidance):
    # maps the 1-5 slider value to guidance in the range [1.0, 3.0]
    return 1 + ((guidance - 1) * (3 - 1)) / (5 - 1)
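
# Worked examples: denormalise_top_p(0) == 0.9, denormalise_top_p(10) == 1.0;
# denormalise_guidance(1) == 1.0, denormalise_guidance(3) == 2.0, denormalise_guidance(5) == 3.0.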
def _check_file_size(path):
    if not path:
        return
    filesize = os.path.getsize(path)
    filesize_mb = filesize / 1024 / 1024
    if filesize_mb >= 50:
        raise gr.Error(f"Please upload a sample smaller than 50MB for voice cloning. Provided: {round(filesize_mb)} MB")
def _handle_edge_cases(to_say, upload_target):
    if not to_say:
        raise gr.Error("Please provide text to synthesise")
    if len(to_say) > MAX_CHARS:
        gr.Warning(
            f"Max {MAX_CHARS} characters allowed. Provided: {len(to_say)} characters. Truncating and generating speech; the end of the result may be unstable."
        )
    if not upload_target:
        return
    check_audio_file(upload_target)  # checks that the file duration is at least 30s
    _check_file_size(upload_target)
def tts(to_say, top_p, guidance, toggle, preset_dropdown, upload_target):
    try:
        d_top_p = denormalise_top_p(top_p)
        d_guidance = denormalise_guidance(guidance)

        _handle_edge_cases(to_say, upload_target)

        to_say = to_say if len(to_say) < MAX_CHARS else to_say[:MAX_CHARS]

        return TTS_MODEL.synthesise(
            text=to_say,
            spk_ref_path=PRESET_VOICES[preset_dropdown] if toggle == RADIO_CHOICES[0] else upload_target,
            top_p=d_top_p,
            guidance_scale=d_guidance,
        )
    except Exception as e:
        raise gr.Error(f"Something went wrong. Reason: {str(e)}")
def change_voice_selection_layout(choice):
    if choice == RADIO_CHOICES[0]:
        return [gr.update(visible=True), gr.update(visible=False)]
    return [gr.update(visible=False), gr.update(visible=True)]


def change_emotion_selection_layout(choice):
    if choice == EMO_NAMES[0]:
        return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)]
    return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)]
title = """
<!-- Google Tag Manager -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-5N27BQH8');</script>
<!-- End Google Tag Manager -->
<h1 style="margin-top: 10px;" class="page-title">Demo for <span style="margin-left: 10px;background-color: #E0FEE4;padding: 15px;border-radius: 10px;">🎛️ EmoKnob</span></h1>
<!-- Google Tag Manager (noscript) -->
<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-5N27BQH8"
height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
<!-- End Google Tag Manager (noscript) -->
"""
description = """
- EmoKnob applies fine-grained emotion control to voice cloning of arbitrary speakers.
- EmoKnob <b>extracts emotion from a pair of emotional and neutral audio clips from the same speaker.</b>
- In this demo, you can select from a few preset voices or upload your own sample to clone.
- You can then apply a preset emotion, or extract an emotion from your own pair of emotional and neutral audio.
- You can adjust the strength of the emotion with the slider.

Check out our [project page](https://emoknob.cs.columbia.edu/) for more details.

EmoKnob uses [MetaVoice](https://github.com/metavoiceio/metavoice-src) as its voice cloning backbone.
"""
with gr.Blocks(title="EmoKnob: Enhance Voice Cloning with Fine-Grained Emotion Control") as demo:
    gr.Markdown(title)
    gr.Markdown(description)

    gr.Image("https://raw.githubusercontent.com/tonychenxyz/emoknob/main/docs/assets/emo-knob-teaser-1.svg", show_label=False, container=False)

    with gr.Row():
        with gr.Column():
            to_say = gr.TextArea(
                label=f"What should I say!? (max {MAX_CHARS} characters).",
                lines=4,
                value="To be or not to be, that is the question.",
            )

            # voice select
            with gr.Row(), gr.Column():
                toggle = gr.Radio(choices=RADIO_CHOICES, label="Choose voice", value=RADIO_CHOICES[0])

            with gr.Row() as row_1:
                preset_dropdown = gr.Dropdown(
                    PRESET_VOICES.keys(), label="Preset voices", value=list(PRESET_VOICES.keys())[0]
                )
                with gr.Accordion("Preview: Preset voices", open=False):
                    for label, path in PRESET_VOICES.items():
                        gr.Audio(value=path, label=label)

            with gr.Row(visible=False) as row_2:
                upload_target = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload a clean sample to clone.",
                )

            with gr.Row(), gr.Column():
                strength = gr.Slider(
                    value=0.1,
                    minimum=0.0,
                    maximum=1.0,
                    step=0.01,
                    label="Strength - how strong the emotion is. Recommended value is between 0.0 and 0.6.",
                )

            with gr.Row():
                emotion_name = gr.Radio(choices=EMO_NAMES, label="Emotion", value=EMO_NAMES[1])  # Set default to second option

            with gr.Row(visible=False) as row_3:
                upload_neutral = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Neutral sample for emotion extraction.",
                )
                upload_emo = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Emotional sample for emotion extraction.",
                )

            with gr.Row(), gr.Column():
                # voice settings
                top_p = gr.Slider(
                    value=0.95,
                    minimum=0.0,
                    maximum=10.0,
                    step=1.0,
                    label="Speech Stability - improves text following for a challenging speaker",
                )
                guidance = gr.Slider(
                    value=3.0,
                    minimum=1.0,
                    maximum=5.0,
                    step=1.0,
                    label="Speaker similarity - How closely to match speaker identity and speech style.",
                )

            emotion_name.change(
                change_emotion_selection_layout,
                inputs=emotion_name,
                outputs=[row_3, upload_neutral, upload_emo],
            )
            toggle.change(
                change_voice_selection_layout,
                inputs=toggle,
                outputs=[row_1, row_2],
            )

        with gr.Column():
            speech = gr.Audio(
                type="filepath",
                label="Model says...",
            )
            submit = gr.Button("Generate Speech")

    submit.click(
        fn=generate_sample,
        inputs=[to_say, emotion_name, upload_target, upload_emo, upload_neutral, strength, top_p, guidance, preset_dropdown, toggle],
        outputs=speech,
    )

demo.launch()
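
# A request queue is commonly enabled for GPU-backed Gradio Spaces so that concurrent
# requests are served one at a time; this is an optional tweak, not part of the original:
#   demo.queue(max_size=10).launch()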