Spaces · Runtime error

Commit 9bfcf61 · fixed demo
Parent(s): e9585f6

Files changed:
- all_emo_dirs.pkl +2 -2
- app.py +115 -89
- fam/llm/__pycache__/fast_inference_utils.cpython-39.pyc +0 -0
all_emo_dirs.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3160074617894c8a0fb888fac217b3c4ae0a647e4b218aa498d2ff356e040f9e
+size 21612
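The pickle is tracked with Git LFS, so the diff above only records the pointer's new sha256 and size, not the pickle contents (the previous oid and size are truncated in this view). As an aside, not part of the commit, a materialized copy of the file can be checked against those pointer fields with the standard library alone; the helper name below is ours, not the repo's:

import hashlib
import os

def verify_lfs_object(path, expected_oid, expected_size):
    # Compare byte size first (cheap), then the sha256 digest against the pointer's oid.
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# With the values committed here:
# verify_lfs_object("all_emo_dirs.pkl",
#                   "3160074617894c8a0fb888fac217b3c4ae0a647e4b218aa498d2ff356e040f9e",
#                   21612)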
app.py CHANGED
@@ -2,7 +2,7 @@
 import os
 import subprocess
 import sys
-
+
 
 def install(package):
     if '=' in package:
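Only the first two lines of install() appear in this hunk; its body is unchanged by the commit. For orientation, a helper with this shape usually shells out to pip through the running interpreter, roughly as sketched below (illustrative only, not the file's actual body):

import subprocess
import sys

def install(package):
    # e.g. install('gradio==4.44.0') passes a pinned requirement straight to pip
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])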
@@ -21,9 +21,12 @@ def install(package):
 # install('gradio==4.44.0')
 # install('spacy==3.7')
 
+debug = False
 is_prod = True
 if os.environ.get('PROD_MODE') == 'local':
     is_prod = False
+else:
+    debug = False
 
 import pickle
 
@@ -42,37 +45,38 @@ if not is_prod:
     os.environ['PATH'] += os.pathsep + ffmpeg_path
 
 
-
-import
-
-import
-
+import torch
+if not debug:
+    import shutil
+    import tempfile
+    import time
+    from pathlib import Path
 
-import librosa
-import torch
-from huggingface_hub import snapshot_download
-
-from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
-from fam.llm.decoders import EncodecDecoder
-from fam.llm.fast_inference_utils import build_model, main
-from fam.llm.inference import (
-    EncodecDecoder,
-    InferenceConfig,
-    Model,
-    TiltedEncodec,
-    TrainedBPETokeniser,
-    get_cached_embedding,
-    get_cached_file,
-    get_enhancer,
-)
-from fam.llm.utils import (
-    check_audio_file,
-    get_default_dtype,
-    get_device,
-    normalize_text,
-)
+    import librosa
+
+    from huggingface_hub import snapshot_download
+
+    from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
+    from fam.llm.decoders import EncodecDecoder
+    from fam.llm.fast_inference_utils import build_model, main
+    from fam.llm.inference import (
+        EncodecDecoder,
+        InferenceConfig,
+        Model,
+        TiltedEncodec,
+        TrainedBPETokeniser,
+        get_cached_embedding,
+        get_cached_file,
+        get_enhancer,
+    )
+    from fam.llm.utils import (
+        check_audio_file,
+        get_default_dtype,
+        get_device,
+        normalize_text,
+    )
 
-debug = False
+
 
 DESCRIPTION = ""
 if not torch.cuda.is_available():
@@ -83,7 +87,8 @@ if torch.cuda.is_available():
     seed = 1337
     output_dir = "outputs"
    _dtype = get_default_dtype()
-    _device = 'cuda:0'
+    # _device = 'cuda:0'
+
     _model_dir = snapshot_download(repo_id=model_name)
     first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
     output_dir = output_dir
@@ -116,7 +121,6 @@ if torch.cuda.is_available():
         compile_prefill=True,
     )
 
-@spaces.GPU
 def generate_sample(text, emo_dir = None, source_path = None, emo_path = None, neutral_path = None, strength = 0.1, top_p = 0.95, guidance_scale = 3.0, preset_dropdown = None, toggle = None):
 
     print('text', text)
@@ -270,32 +274,46 @@ def change_voice_selection_layout(choice):
 
 def change_emotion_selection_layout(choice):
     if choice == EMO_NAMES[0]:
-        return [gr.update(visible=True)]
-
-
+        return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)]
+    else:
+        return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)]
 
 title = """
+<!-- Google Tag Manager -->
+<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
+new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
+j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
+'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
+})(window,document,'script','dataLayer','GTM-5N27BQH8');</script>
+<!-- End Google Tag Manager -->
+
 </style>
 <h1 style="margin-top: 10px;" class="page-title">Demo for <span style="margin-left: 10px;background-color: #E0FEE4;padding: 15px;border-radius: 10px;">🎛️ EmoKnob</span></h1>
+
+<!-- Google Tag Manager (noscript) -->
+<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-5N27BQH8"
+height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
+<!-- End Google Tag Manager (noscript) -->
+
 """
 
 description = """
-
+
+- EmoKnob applies control of emotion over arbitrary speaker.
+- EmoKnob <b>extracts emotion from a pair of emotional and neutral audio from the same speaker.</b>
 - In this demo, you can select from a few preset voices and upload your own emotional samples to clone.
-- You can then
+- You can then apply control of a preset emotion or extract emotion from your own pair of emotional and neutral audio.
 - You can adjust the strength of the emotion by using the slider.
 
+Check out our [project page](https://emoknob.cs.columbia.edu/) for more details.
 
 EmoKnob is uses [MetaVoice](https://github.com/metavoiceio/metavoice-src) as voice cloning backbone.
 """
 
-with gr.Blocks(title="EmoKnob Demo") as demo:
+with gr.Blocks(title="EmoKnob: EmoKnob: Enhance Voice Cloning with Fine-Grained Emotion Control") as demo:
     gr.Markdown(title)
     gr.Markdown(description)
-    gr.Image("emo-knob-teaser-1.svg", show_label=False, container=False)
-
-    with gr.Row():
-        gr.Markdown(description)
+    gr.Image("https://raw.githubusercontent.com/tonychenxyz/emoknob/main/docs/assets/emo-knob-teaser-1.svg", show_label=False, container=False)
 
     with gr.Row():
         with gr.Column():
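The three gr.update(...) values returned by change_emotion_selection_layout line up one-to-one with the three outputs it is wired to later in this commit (row_3, upload_neutral, upload_emo). A self-contained sketch of this Gradio show/hide pattern, using placeholder choice names rather than the app's EMO_NAMES:

import gradio as gr

CHOICES = ["Preset emotion", "Custom emotion pair"]  # placeholders, not the app's EMO_NAMES

def toggle_custom_inputs(choice):
    show = choice == CHOICES[1]
    # One gr.update per output component, in the same order as outputs= below.
    return [gr.update(visible=show), gr.update(visible=show), gr.update(visible=show)]

with gr.Blocks() as sketch:
    emotion_choice = gr.Radio(choices=CHOICES, value=CHOICES[0], label="Emotion source")
    with gr.Row(visible=False) as custom_row:
        neutral_audio = gr.Audio(sources=["upload"], type="filepath", label="Neutral sample")
        emotional_audio = gr.Audio(sources=["upload"], type="filepath", label="Emotional sample")
    emotion_choice.change(
        toggle_custom_inputs,
        inputs=emotion_choice,
        outputs=[custom_row, neutral_audio, emotional_audio],
    )

# sketch.launch()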
@@ -305,7 +323,57 @@ with gr.Blocks(title="EmoKnob Demo") as demo:
             value="To be or not to be, that is the question.",
         )
 
-
+
+
+        # voice select
+
+    with gr.Row(), gr.Column():
+        toggle = gr.Radio(choices=RADIO_CHOICES, label="Choose voice", value=RADIO_CHOICES[0])
+
+
+        with gr.Row() as row_1:
+            preset_dropdown = gr.Dropdown(
+                PRESET_VOICES.keys(), label="Preset voices", value=list(PRESET_VOICES.keys())[0]
+            )
+
+            with gr.Accordion("Preview: Preset voices", open=False):
+                for label, path in PRESET_VOICES.items():
+                    gr.Audio(value=path, label=label)
+
+        with gr.Row(visible=False) as row_2:
+            upload_target = gr.Audio(
+                sources=["upload"],
+                type="filepath",
+                label="Upload a clean sample to clone.",
+            )
+
+
+    with gr.Row(), gr.Column():
+        strength = gr.Slider(
+            value=0.1,
+            minimum=0.0,
+            maximum=1.0,
+            step=0.01,
+            label="Strength - how strong the emotion is. Recommended value is between 0.0 and 0.6.",
+        )
+
+        with gr.Row():
+            emotion_name = gr.Radio(choices=EMO_NAMES, label="Emotion", value=EMO_NAMES[1])  # Set default to second option
+
+
+
+        with gr.Row(visible=False) as row_3:
+            upload_neutral = gr.Audio(
+                sources=["upload"],
+                type="filepath",
+                label="Neutral sample for emotion extraction.",
+            )
+
+            upload_emo = gr.Audio(
+                sources=["upload"],
+                type="filepath",
+                label="Emotional sample for emotion extraction.",
+            )
 
     with gr.Row(), gr.Column():
         # voice settings
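The slider above only passes a scalar strength into generate_sample; this commit does not show how that scalar is applied. Going by the demo description ("extracts emotion from a pair of emotional and neutral audio from the same speaker", adjustable "strength"), the idea is commonly expressed as vector arithmetic on speaker embeddings, roughly as in the schematic below. This is a hedged sketch with hypothetical names, not the repository's generate_sample implementation:

import torch
import torch.nn.functional as F

def extract_emotion_direction(emo_emb, neutral_emb):
    # Direction from a same-speaker neutral/emotional pair of embeddings (hypothetical shapes).
    return F.normalize(emo_emb - neutral_emb, dim=-1)

def apply_emotion(target_emb, direction, strength=0.1):
    # Nudge the cloned speaker's embedding along the emotion direction; strength is the knob.
    return target_emb + strength * direction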
@@ -324,47 +392,11 @@ with gr.Blocks(title="EmoKnob Demo") as demo:
             label="Speaker similarity - How closely to match speaker identity and speech style.",
         )
 
-
-
-
-
-
-            label="Strength - how strong the emotion is. Setting it to too large a value may result in unstable output.",
-        )
-
-
-
-        # voice select
-        toggle = gr.Radio(choices=RADIO_CHOICES, label="Choose voice", value=RADIO_CHOICES[0])
-
-        with gr.Row(visible=True) as row_1:
-            preset_dropdown = gr.Dropdown(
-                PRESET_VOICES.keys(), label="Preset voices", value=list(PRESET_VOICES.keys())[0]
-            )
-            with gr.Accordion("Preview: Preset voices", open=False):
-                for label, path in PRESET_VOICES.items():
-                    gr.Audio(value=path, label=label)
-
-        with gr.Row(visible=False) as row_2:
-            upload_target = gr.Audio(
-                sources=["upload"],
-                type="filepath",
-                label="Upload a clean sample to clone.",
-            )
-        with gr.Row():
-            emotion_name = gr.Radio(choices=EMO_NAMES, label="Emotion", value=EMO_NAMES[0])
-        with gr.Row(visible=True) as row_3:
-            upload_neutral = gr.Audio(
-                sources=["upload"],
-                type="filepath",
-                label="Upload a neutral sample to compute the emotion direction. Should be same speaker as the emotional sample.",
-            )
-
-            upload_emo = gr.Audio(
-                sources=["upload"],
-                type="filepath",
-                label="Upload an emotional sample to compute the emotion direction. Should be same speaker as the neutral sample.",
-            )
+    emotion_name.change(
+        change_emotion_selection_layout,
+        inputs=emotion_name,
+        outputs=[row_3, upload_neutral, upload_emo],
+    )
 
         toggle.change(
             change_voice_selection_layout,
@@ -372,12 +404,6 @@ with gr.Blocks(title="EmoKnob Demo") as demo:
             outputs=[row_1, row_2],
         )
 
-        # emotion_name.change(
-        #     change_emotion_selection_layout,
-        #     inputs=emotion_name,
-        #     outputs=[row_3],
-        # )
-
     with gr.Column():
         speech = gr.Audio(
             type="filepath",
fam/llm/__pycache__/fast_inference_utils.cpython-39.pyc CHANGED
Binary files a/fam/llm/__pycache__/fast_inference_utils.cpython-39.pyc and b/fam/llm/__pycache__/fast_inference_utils.cpython-39.pyc differ