Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -44,8 +44,19 @@ vocoder_model.eval(inference=False)
|
|
| 44 |
|
| 45 |
|
| 46 |
def plot_spec_align(mel, align):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
grid_spec = gridspec.GridSpec(1, 1)
|
| 48 |
-
|
| 49 |
ax = plt.subplot(grid_spec[0])
|
| 50 |
plt.imshow(mel)
|
| 51 |
plt.axis('off')
|
|
@@ -58,8 +69,9 @@ def plot_spec_align(mel, align):
|
|
| 58 |
|
| 59 |
plt.imshow(legend, interpolation='nearest')
|
| 60 |
plt.grid('off')
|
|
|
|
| 61 |
|
| 62 |
-
return
|
| 63 |
|
| 64 |
|
| 65 |
def synthesize(text, gst_1, gst_2, gst_3):
|
|
@@ -80,14 +92,14 @@ def synthesize(text, gst_1, gst_2, gst_3):
|
|
| 80 |
# prepare plot for the output:
|
| 81 |
mel_outputs_postnet = mel_outputs_postnet.squeeze().detach().numpy()
|
| 82 |
alignments = alignments.squeeze().detach().numpy()
|
| 83 |
-
|
| 84 |
|
| 85 |
-
return (22050, audio_numpy),
|
| 86 |
|
| 87 |
|
| 88 |
iface = gr.Interface(fn=synthesize, inputs=[gr.Textbox(label="Input Text"), gr.Slider(0.2, 0.45, label="First style token weight:"),
|
| 89 |
gr.Slider(0.2, 0.45, label="Second style token weight:"), gr.Slider(0.2, 0.45, label="Third style token weight:")],
|
| 90 |
-
outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.
|
| 91 |
title="Single-Head Attention Tacotron2 with Style Tokens", description=DESCRIPTION)
|
| 92 |
iface.launch()
|
| 93 |
|
|
|
|
| 44 |
|
| 45 |
|
| 46 |
def plot_spec_align(mel, align):
    """Render the mel spectrogram and the alignment as two separate figures.

    Args:
        mel: Image data for the spectrogram figure; passed straight to
            ``Axes.imshow``.
        align: Image data for the alignment figure; passed straight to
            ``Axes.imshow``.

    Returns:
        A ``(fig_mel, fig_align)`` tuple of matplotlib ``Figure`` objects.
    """
    # Build the figures directly instead of through ``plt.figure()``.
    # pyplot registers every figure it creates in a global registry and keeps
    # it alive until it is explicitly closed, so figures created per call and
    # never closed would accumulate for the lifetime of the process.
    from matplotlib.figure import Figure

    fig_mel = Figure()
    ax_mel = fig_mel.add_subplot(111)
    ax_mel.imshow(mel)
    ax_mel.set_title('Mel-Scale Spectrogram', fontsize=20)

    fig_align = Figure()
    ax_align = fig_align.add_subplot(111)
    ax_align.imshow(align)
    ax_align.set_title('Alignment', fontsize=20)

    return fig_mel, fig_align
|
| 75 |
|
| 76 |
|
| 77 |
def synthesize(text, gst_1, gst_2, gst_3):
|
|
|
|
| 92 |
# prepare plot for the output:
|
| 93 |
mel_outputs_postnet = mel_outputs_postnet.squeeze().detach().numpy()
|
| 94 |
alignments = alignments.squeeze().detach().numpy()
|
| 95 |
+
fig_mel, fig_align = plot_spec_align(mel_outputs_postnet, alignments)
|
| 96 |
|
| 97 |
+
return (22050, audio_numpy), fig_mel, fig_align
|
| 98 |
|
| 99 |
|
| 100 |
# Assemble the Gradio UI: one text box and three style-token weight sliders
# are passed to `synthesize`; its results are shown as audio plus two plots.
iface = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Slider(0.2, 0.45, label="First style token weight:"),
        gr.Slider(0.2, 0.45, label="Second style token weight:"),
        gr.Slider(0.2, 0.45, label="Third style token weight:"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Plot(label="Spectrogram"),
        gr.Plot(label="Alignments"),
    ],
    title="Single-Head Attention Tacotron2 with Style Tokens",
    description=DESCRIPTION,
)
# Start serving the interface.
iface.launch()
|
| 105 |
|