Spaces:

AlexK-PL
/

Tacotron2_GST_eng

Sleeping

AlexK-PL committed on Sep 5, 2023

Commit

92be68f

1 Parent(s): 2f6ba98

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -44,8 +44,19 @@ vocoder_model.eval(inference=False)
 def plot_spec_align(mel, align):
     grid_spec = gridspec.GridSpec(1, 1)
     ax = plt.subplot(grid_spec[0])
     plt.imshow(mel)
     plt.axis('off')
@@ -58,8 +69,9 @@ def plot_spec_align(mel, align):
     plt.imshow(legend, interpolation='nearest')
     plt.grid('off')
-    return plt
 def synthesize(text, gst_1, gst_2, gst_3):
@@ -80,14 +92,14 @@ def synthesize(text, gst_1, gst_2, gst_3):
     # prepare plot for the output:
     mel_outputs_postnet = mel_outputs_postnet.squeeze().detach().numpy()
     alignments = alignments.squeeze().detach().numpy()
-    plt = plot_spec_align(mel_outputs_postnet, alignments)
-    return (22050, audio_numpy), plt
 iface = gr.Interface(fn=synthesize, inputs=[gr.Textbox(label="Input Text"), gr.Slider(0.2, 0.45, label="First style token weight:"),
                                             gr.Slider(0.2, 0.45, label="Second style token weight:"), gr.Slider(0.2, 0.45, label="Third style token weight:")],
-                     outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.outputs.Image(type="plot", label="Output"),],
                      title="Single-Head Attention Tacotron2 with Style Tokens", description=DESCRIPTION)
 iface.launch()

 def plot_spec_align(mel, align):
+    fig_mel = plt.figure()
+    ax_mel = fig_mel.add_subplot(111)
+    ax_mel.imshow(mel)
+    ax_mel.set_title('Mel-Scale Spectrogram', fontsize=20)
+    fig_align = plt.figure()
+    ax_align = fig_align.add_subplot(111)
+    ax_align.imshow(align)
+    ax_align.set_title('Alignment', fontsize=20)
+    '''
     grid_spec = gridspec.GridSpec(1, 1)
     ax = plt.subplot(grid_spec[0])
     plt.imshow(mel)
     plt.axis('off')
     plt.imshow(legend, interpolation='nearest')
     plt.grid('off')
+    '''
+    return fig_mel, fig_align
 def synthesize(text, gst_1, gst_2, gst_3):
     # prepare plot for the output:
     mel_outputs_postnet = mel_outputs_postnet.squeeze().detach().numpy()
     alignments = alignments.squeeze().detach().numpy()
+    fig_mel, fig_align = plot_spec_align(mel_outputs_postnet, alignments)
+    return (22050, audio_numpy), fig_mel, fig_align
 iface = gr.Interface(fn=synthesize, inputs=[gr.Textbox(label="Input Text"), gr.Slider(0.2, 0.45, label="First style token weight:"),
                                             gr.Slider(0.2, 0.45, label="Second style token weight:"), gr.Slider(0.2, 0.45, label="Third style token weight:")],
+                     outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.Plot(label="Spectrogram"), gr.Plot(label="Alignments")],
                      title="Single-Head Attention Tacotron2 with Style Tokens", description=DESCRIPTION)
 iface.launch()