Spaces:
Running
on
Zero
Running
on
Zero
Harry Coultas Blum
committed on
Commit
·
1a21598
1
Parent(s):
c64babc
trying to cast
Browse files
- requirements.txt +0 -1
- vui/inference.py +6 -29
requirements.txt
CHANGED
|
@@ -7,7 +7,6 @@ numba
|
|
| 7 |
numpy
|
| 8 |
feedparser
|
| 9 |
pydantic
|
| 10 |
-
pyannote.audio
|
| 11 |
soundfile
|
| 12 |
tiktoken
|
| 13 |
torchaudio
|
|
|
|
| 7 |
numpy
|
| 8 |
feedparser
|
| 9 |
pydantic
|
|
|
|
| 10 |
soundfile
|
| 11 |
tiktoken
|
| 12 |
torchaudio
|
vui/inference.py
CHANGED
|
@@ -4,14 +4,12 @@ import time
|
|
| 4 |
import inflect
|
| 5 |
import torch
|
| 6 |
import torch.nn.functional as F
|
| 7 |
-
import torchaudio
|
| 8 |
from torchaudio.transforms import Resample
|
| 9 |
from torch import Tensor
|
| 10 |
from torch.nn.attention import SDPBackend, sdpa_kernel
|
| 11 |
|
| 12 |
from vui.model import Vui
|
| 13 |
from vui.sampling import multinomial, sample_top_k, sample_top_p, sample_top_p_top_k
|
| 14 |
-
from vui.vad import detect_voice_activity as vad
|
| 15 |
|
| 16 |
resample = Resample(22050, 16000).cuda()
|
| 17 |
|
|
@@ -154,7 +152,7 @@ def generate(
|
|
| 154 |
):
|
| 155 |
text = simple_clean(text)
|
| 156 |
with (
|
| 157 |
-
torch.
|
| 158 |
sdpa_kernel([SDPBackend.MATH]),
|
| 159 |
):
|
| 160 |
t1 = time.perf_counter()
|
|
@@ -330,15 +328,8 @@ def render(
|
|
| 330 |
)
|
| 331 |
codes = codes[..., :-10]
|
| 332 |
audio = self.codec.from_indices(codes)
|
|
|
|
| 333 |
|
| 334 |
-
paudio = resample(audio[0])
|
| 335 |
-
results = vad(paudio)
|
| 336 |
-
|
| 337 |
-
if len(results):
|
| 338 |
-
# Cut the audio based on VAD results, add 200ms silence at end
|
| 339 |
-
s, e = results[0][0], results[-1][1]
|
| 340 |
-
return audio[..., int(s * SR) : int((e + 0.2) * SR)].cpu()
|
| 341 |
-
|
| 342 |
raise Exception("Failed to render")
|
| 343 |
|
| 344 |
# Otherwise we have to do some clever chaining!
|
|
@@ -374,24 +365,10 @@ def render(
|
|
| 374 |
)
|
| 375 |
|
| 376 |
codes = codes[..., :-10]
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
results = vad(paudio)
|
| 382 |
-
run = len(results) == 0
|
| 383 |
-
|
| 384 |
-
if len(results):
|
| 385 |
-
prev_text = line
|
| 386 |
-
# Cut the audio based on VAD results, add 200ms silence at end
|
| 387 |
-
s, e = results[0][0], results[0][1]
|
| 388 |
-
codes = codes[..., int(s * HZ) : int(e * HZ)]
|
| 389 |
-
prev_codes = codes
|
| 390 |
-
audio = audio[..., int(s * SR) : int((e + 0.2) * SR)].cpu()
|
| 391 |
-
audios.append(audio)
|
| 392 |
-
else:
|
| 393 |
-
prev_codes = orig_codes
|
| 394 |
-
prev_text = ""
|
| 395 |
except KeyboardInterrupt:
|
| 396 |
break
|
| 397 |
except RuntimeError as e:
|
|
|
|
| 4 |
import inflect
|
| 5 |
import torch
|
| 6 |
import torch.nn.functional as F
|
|
|
|
| 7 |
from torchaudio.transforms import Resample
|
| 8 |
from torch import Tensor
|
| 9 |
from torch.nn.attention import SDPBackend, sdpa_kernel
|
| 10 |
|
| 11 |
from vui.model import Vui
|
| 12 |
from vui.sampling import multinomial, sample_top_k, sample_top_p, sample_top_p_top_k
|
|
|
|
| 13 |
|
| 14 |
resample = Resample(22050, 16000).cuda()
|
| 15 |
|
|
|
|
| 152 |
):
|
| 153 |
text = simple_clean(text)
|
| 154 |
with (
|
| 155 |
+
torch.autocast("cuda", torch.bfloat16, True),
|
| 156 |
sdpa_kernel([SDPBackend.MATH]),
|
| 157 |
):
|
| 158 |
t1 = time.perf_counter()
|
|
|
|
| 328 |
)
|
| 329 |
codes = codes[..., :-10]
|
| 330 |
audio = self.codec.from_indices(codes)
|
| 331 |
+
return audio
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
raise Exception("Failed to render")
|
| 334 |
|
| 335 |
# Otherwise we have to do some clever chaining!
|
|
|
|
| 365 |
)
|
| 366 |
|
| 367 |
codes = codes[..., :-10]
|
| 368 |
+
paudio = self.codec.from_indices(codes)
|
| 369 |
+
prev_text = line
|
| 370 |
+
prev_codes = codes
|
| 371 |
+
audios.append(paudio)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
except KeyboardInterrupt:
|
| 373 |
break
|
| 374 |
except RuntimeError as e:
|