Spaces:
Running
on
Zero
Running
on
Zero
Harry Coultas Blum
committed on
Commit
·
1a21598
1
Parent(s):
c64babc
trying to cast
Browse files
- requirements.txt +0 -1
- vui/inference.py +6 -29
requirements.txt
CHANGED
|
@@ -7,7 +7,6 @@ numba
|
|
| 7 |
numpy
|
| 8 |
feedparser
|
| 9 |
pydantic
|
| 10 |
-
pyannote.audio
|
| 11 |
soundfile
|
| 12 |
tiktoken
|
| 13 |
torchaudio
|
|
|
|
| 7 |
numpy
|
| 8 |
feedparser
|
| 9 |
pydantic
|
|
|
|
| 10 |
soundfile
|
| 11 |
tiktoken
|
| 12 |
torchaudio
|
vui/inference.py
CHANGED
|
@@ -4,14 +4,12 @@ import time
|
|
| 4 |
import inflect
|
| 5 |
import torch
|
| 6 |
import torch.nn.functional as F
|
| 7 |
-
import torchaudio
|
| 8 |
from torchaudio.transforms import Resample
|
| 9 |
from torch import Tensor
|
| 10 |
from torch.nn.attention import SDPBackend, sdpa_kernel
|
| 11 |
|
| 12 |
from vui.model import Vui
|
| 13 |
from vui.sampling import multinomial, sample_top_k, sample_top_p, sample_top_p_top_k
|
| 14 |
-
from vui.vad import detect_voice_activity as vad
|
| 15 |
|
| 16 |
resample = Resample(22050, 16000).cuda()
|
| 17 |
|
|
@@ -154,7 +152,7 @@ def generate(
|
|
| 154 |
):
|
| 155 |
text = simple_clean(text)
|
| 156 |
with (
|
| 157 |
-
torch.
|
| 158 |
sdpa_kernel([SDPBackend.MATH]),
|
| 159 |
):
|
| 160 |
t1 = time.perf_counter()
|
|
@@ -330,15 +328,8 @@ def render(
|
|
| 330 |
)
|
| 331 |
codes = codes[..., :-10]
|
| 332 |
audio = self.codec.from_indices(codes)
|
|
|
|
| 333 |
|
| 334 |
-
paudio = resample(audio[0])
|
| 335 |
-
results = vad(paudio)
|
| 336 |
-
|
| 337 |
-
if len(results):
|
| 338 |
-
# Cut the audio based on VAD results, add 200ms silence at end
|
| 339 |
-
s, e = results[0][0], results[-1][1]
|
| 340 |
-
return audio[..., int(s * SR) : int((e + 0.2) * SR)].cpu()
|
| 341 |
-
|
| 342 |
raise Exception("Failed to render")
|
| 343 |
|
| 344 |
# Otherwise we have to do some clever chaining!
|
|
@@ -374,24 +365,10 @@ def render(
|
|
| 374 |
)
|
| 375 |
|
| 376 |
codes = codes[..., :-10]
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
results = vad(paudio)
|
| 382 |
-
run = len(results) == 0
|
| 383 |
-
|
| 384 |
-
if len(results):
|
| 385 |
-
prev_text = line
|
| 386 |
-
# Cut the audio based on VAD results, add 200ms silence at end
|
| 387 |
-
s, e = results[0][0], results[0][1]
|
| 388 |
-
codes = codes[..., int(s * HZ) : int(e * HZ)]
|
| 389 |
-
prev_codes = codes
|
| 390 |
-
audio = audio[..., int(s * SR) : int((e + 0.2) * SR)].cpu()
|
| 391 |
-
audios.append(audio)
|
| 392 |
-
else:
|
| 393 |
-
prev_codes = orig_codes
|
| 394 |
-
prev_text = ""
|
| 395 |
except KeyboardInterrupt:
|
| 396 |
break
|
| 397 |
except RuntimeError as e:
|
|
|
|
| 4 |
import inflect
|
| 5 |
import torch
|
| 6 |
import torch.nn.functional as F
|
|
|
|
| 7 |
from torchaudio.transforms import Resample
|
| 8 |
from torch import Tensor
|
| 9 |
from torch.nn.attention import SDPBackend, sdpa_kernel
|
| 10 |
|
| 11 |
from vui.model import Vui
|
| 12 |
from vui.sampling import multinomial, sample_top_k, sample_top_p, sample_top_p_top_k
|
|
|
|
| 13 |
|
| 14 |
resample = Resample(22050, 16000).cuda()
|
| 15 |
|
|
|
|
| 152 |
):
|
| 153 |
text = simple_clean(text)
|
| 154 |
with (
|
| 155 |
+
torch.autocast("cuda", torch.bfloat16, True),
|
| 156 |
sdpa_kernel([SDPBackend.MATH]),
|
| 157 |
):
|
| 158 |
t1 = time.perf_counter()
|
|
|
|
| 328 |
)
|
| 329 |
codes = codes[..., :-10]
|
| 330 |
audio = self.codec.from_indices(codes)
|
| 331 |
+
return audio
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
raise Exception("Failed to render")
|
| 334 |
|
| 335 |
# Otherwise we have to do some clever chaining!
|
|
|
|
| 365 |
)
|
| 366 |
|
| 367 |
codes = codes[..., :-10]
|
| 368 |
+
paudio = self.codec.from_indices(codes)
|
| 369 |
+
prev_text = line
|
| 370 |
+
prev_codes = codes
|
| 371 |
+
audios.append(paudio)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
except KeyboardInterrupt:
|
| 373 |
break
|
| 374 |
except RuntimeError as e:
|