Spaces:
Build error
Build error
Update audio_foundation_models.py
Browse files- audio_foundation_models.py +27 -10
audio_foundation_models.py
CHANGED
|
@@ -190,7 +190,7 @@ class I2A:
|
|
| 190 |
|
| 191 |
@prompts(name="Generate Audio From The Image",
|
| 192 |
description="useful for when you want to generate an audio "
|
| 193 |
-
"based on an image."
|
| 194 |
"The input to this tool should be a string, "
|
| 195 |
"representing the image_path. ")
|
| 196 |
|
|
@@ -237,6 +237,23 @@ class I2A:
|
|
| 237 |
print(f"Processed I2a.run, image_filename: {image}, audio_filename: {audio_filename}")
|
| 238 |
return audio_filename
|
| 239 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
class T2S:
|
| 241 |
def __init__(self, device= None):
|
| 242 |
if device is None:
|
|
@@ -394,14 +411,6 @@ class Inpaint:
|
|
| 394 |
input_wav = ori_wav[:input_len]
|
| 395 |
mel = TRANSFORMS_16000(input_wav)
|
| 396 |
return mel
|
| 397 |
-
def show_mel_fn(self, input_audio_path):
|
| 398 |
-
crop_len = 500 # the full mel cannot be showed due to gradio's Image bug when using tool='sketch'
|
| 399 |
-
crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
|
| 400 |
-
color_mel = self.cmap_transform(crop_mel)
|
| 401 |
-
image = Image.fromarray((color_mel*255).astype(np.uint8))
|
| 402 |
-
image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
|
| 403 |
-
image.save(image_filename)
|
| 404 |
-
return image_filename
|
| 405 |
def inpaint(self, batch, seed, ddim_steps, num_samples=1, W=512, H=512):
|
| 406 |
model = self.sampler.model
|
| 407 |
|
|
@@ -432,7 +441,7 @@ class Inpaint:
|
|
| 432 |
inapint_wav = self.vocoder.vocode(inpainted)
|
| 433 |
|
| 434 |
return inpainted, inapint_wav
|
| 435 |
-
def
|
| 436 |
SAMPLE_RATE = 16000
|
| 437 |
torch.set_grad_enabled(False)
|
| 438 |
mel_img = Image.open(mel_and_mask['image'])
|
|
@@ -462,6 +471,14 @@ class Inpaint:
|
|
| 462 |
audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
|
| 463 |
soundfile.write(audio_filename, gen_wav, samplerate = 16000)
|
| 464 |
return image_filename, audio_filename
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
class ASR:
|
| 467 |
def __init__(self, device):
|
|
|
|
| 190 |
|
| 191 |
@prompts(name="Generate Audio From The Image",
|
| 192 |
description="useful for when you want to generate an audio "
|
| 193 |
+
"based on an image. "
|
| 194 |
"The input to this tool should be a string, "
|
| 195 |
"representing the image_path. ")
|
| 196 |
|
|
|
|
| 237 |
print(f"Processed I2a.run, image_filename: {image}, audio_filename: {audio_filename}")
|
| 238 |
return audio_filename
|
| 239 |
|
| 240 |
+
class TTS:
|
| 241 |
+
def __init__(self, device=None):
|
| 242 |
+
self.inferencer = TTSInference(device)
|
| 243 |
+
|
| 244 |
+
@prompts(name="Synthesize Speech Given the User Input Text",
|
| 245 |
+
description="useful for when you want to convert a user input text into speech audio it saved it to a file."
|
| 246 |
+
"The input to this tool should be a string, "
|
| 247 |
+
"representing the text used to be converted to speech.")
|
| 248 |
+
|
| 249 |
+
def inference(self, text):
|
| 250 |
+
global temp_audio_filename
|
| 251 |
+
inp = {"text": text}
|
| 252 |
+
out = self.inferencer.infer_once(inp)
|
| 253 |
+
audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
|
| 254 |
+
soundfile.write(audio_filename, out, samplerate = 22050)
|
| 255 |
+
return audio_filename
|
| 256 |
+
|
| 257 |
class T2S:
|
| 258 |
def __init__(self, device= None):
|
| 259 |
if device is None:
|
|
|
|
| 411 |
input_wav = ori_wav[:input_len]
|
| 412 |
mel = TRANSFORMS_16000(input_wav)
|
| 413 |
return mel
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
def inpaint(self, batch, seed, ddim_steps, num_samples=1, W=512, H=512):
|
| 415 |
model = self.sampler.model
|
| 416 |
|
|
|
|
| 441 |
inapint_wav = self.vocoder.vocode(inpainted)
|
| 442 |
|
| 443 |
return inpainted, inapint_wav
|
| 444 |
+
def predict(self, input_audio, mel_and_mask, seed = 55, ddim_steps = 100):
|
| 445 |
SAMPLE_RATE = 16000
|
| 446 |
torch.set_grad_enabled(False)
|
| 447 |
mel_img = Image.open(mel_and_mask['image'])
|
|
|
|
| 471 |
audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
|
| 472 |
soundfile.write(audio_filename, gen_wav, samplerate = 16000)
|
| 473 |
return image_filename, audio_filename
|
| 474 |
+
def inference(self, input_audio_path):
|
| 475 |
+
crop_len = 500 # the full mel cannot be showed due to gradio's Image bug when using tool='sketch'
|
| 476 |
+
crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
|
| 477 |
+
color_mel = self.cmap_transform(crop_mel)
|
| 478 |
+
image = Image.fromarray((color_mel*255).astype(np.uint8))
|
| 479 |
+
image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
|
| 480 |
+
image.save(image_filename)
|
| 481 |
+
return image_filename
|
| 482 |
|
| 483 |
class ASR:
|
| 484 |
def __init__(self, device):
|