Spaces · Runtime error
Commit e9585f6 · Parent(s): abbdb85
added gpu detection
app.py
CHANGED
@@ -2,6 +2,7 @@
 import os
 import subprocess
 import sys
+import spaces
 
 def install(package):
     if '=' in package:
@@ -41,6 +42,7 @@ if not is_prod:
     os.environ['PATH'] += os.pathsep + ffmpeg_path
 
 
+
 import shutil
 import tempfile
 import time
@@ -71,45 +73,50 @@ from fam.llm.utils import (
 )
 
 debug = False
-if not debug:
-    model_name = "metavoiceio/metavoice-1B-v0.1"
-    seed = 1337
-    output_dir = "outputs"
-    _dtype = get_default_dtype()
-    _device = 'cuda:0'
-    _model_dir = snapshot_download(repo_id=model_name)
-    first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
-    output_dir = output_dir
-    os.makedirs(output_dir, exist_ok=True)
-
-    second_stage_ckpt_path = f"{_model_dir}/second_stage.pt"
-    config_second_stage = InferenceConfig(
-        ckpt_path=second_stage_ckpt_path,
-        num_samples=1,
-        seed=seed,
-        device=_device,
-        dtype=_dtype,
-        compile=False,
-        init_from="resume",
-        output_dir=output_dir,
-    )
-    data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
-    llm_second_stage = Model(
-        config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
-    )
-    enhancer = get_enhancer("df")
-
-    precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[_dtype]
-    model, tokenizer, smodel, model_size = build_model(
-        precision=precision,
-        checkpoint_path=Path(f"{_model_dir}/first_stage.pt"),
-        spk_emb_ckpt_path=Path(f"{_model_dir}/speaker_encoder.pt"),
-        device=_device,
-        compile=True,
-        compile_prefill=True,
-    )
 
+DESCRIPTION = ""
+if not torch.cuda.is_available():
+    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
+if torch.cuda.is_available():
+    if not debug:
+        model_name = "metavoiceio/metavoice-1B-v0.1"
+        seed = 1337
+        output_dir = "outputs"
+        _dtype = get_default_dtype()
+        _device = 'cuda:0'
+        _model_dir = snapshot_download(repo_id=model_name)
+        first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
+        output_dir = output_dir
+        os.makedirs(output_dir, exist_ok=True)
+
+        second_stage_ckpt_path = f"{_model_dir}/second_stage.pt"
+        config_second_stage = InferenceConfig(
+            ckpt_path=second_stage_ckpt_path,
+            num_samples=1,
+            seed=seed,
+            device=_device,
+            dtype=_dtype,
+            compile=False,
+            init_from="resume",
+            output_dir=output_dir,
+        )
+        data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
+        llm_second_stage = Model(
+            config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
+        )
+        enhancer = get_enhancer("df")
+
+        precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[_dtype]
+        model, tokenizer, smodel, model_size = build_model(
+            precision=precision,
+            checkpoint_path=Path(f"{_model_dir}/first_stage.pt"),
+            spk_emb_ckpt_path=Path(f"{_model_dir}/speaker_encoder.pt"),
+            device=_device,
+            compile=True,
+            compile_prefill=True,
+        )
 
+@spaces.GPU
 def generate_sample(text, emo_dir = None, source_path = None, emo_path = None, neutral_path = None, strength = 0.1, top_p = 0.95, guidance_scale = 3.0, preset_dropdown = None, toggle = None):
 
     print('text', text)
@@ -284,6 +291,7 @@ EmoKnob is uses [MetaVoice](https://github.com/metavoiceio/metavoice-src) as voi
 
 with gr.Blocks(title="EmoKnob Demo") as demo:
     gr.Markdown(title)
+    gr.Markdown(description)
    gr.Image("emo-knob-teaser-1.svg", show_label=False, container=False)
 
    with gr.Row():
@@ -383,4 +391,5 @@ with gr.Blocks(title="EmoKnob Demo") as demo:
        outputs=speech,
    )
 
+
demo.launch()
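For reference, the change follows the usual ZeroGPU pattern for Gradio Spaces: detect CUDA up front, run the heavyweight model setup only when a GPU is present, and mark the inference entry point with @spaces.GPU so the Space scheduler attaches a GPU for the duration of that call. Below is a minimal, self-contained sketch of the same pattern; the generate function, the placeholder pipeline, and the widget names are illustrative stand-ins, not code from app.py.

import gradio as gr
import spaces  # Hugging Face ZeroGPU helper package
import torch

DESCRIPTION = ""
if not torch.cuda.is_available():
    # Surface a warning in the UI instead of crashing at import time.
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

pipeline = None
if torch.cuda.is_available():
    # Heavy checkpoint loading happens only when a GPU is actually present.
    pipeline = ...  # hypothetical: load model weights onto "cuda:0" here

@spaces.GPU  # ZeroGPU attaches a GPU only while this function runs
def generate(text: str) -> str:
    # Hypothetical body: run the GPU pipeline on `text`.
    return f"would synthesize: {text}"

with gr.Blocks(title="ZeroGPU pattern sketch") as demo:
    gr.Markdown(DESCRIPTION)
    text_in = gr.Textbox(label="Text")
    text_out = gr.Textbox(label="Result")
    text_in.submit(generate, inputs=text_in, outputs=text_out)

demo.launch()

Outside a ZeroGPU environment the @spaces.GPU decorator is documented to be a no-op, so the same file still runs on an ordinary GPU machine.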