Spaces:

tsi-org
/

tango

Paused

App Files Community

deepanway committed on May 1, 2023

Commit

a664672

1 Parent(s): 2301775

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -4

app.py CHANGED Viewed

@@ -79,9 +79,14 @@ def gradio_generate(prompt, steps, guidance):
     return output_filename
-description_text = '''
-TANGO is a latent diffusion model (LDM) for text-to-audio (TTA) generation. TANGO can generate realistic audios including human sounds, animal sounds, natural and artificial sounds and sound effects from textual prompts. We use the frozen instruction-tuned LLM Flan-T5 as the text encoder and train a UNet based diffusion model for audio generation. We perform comparably to current state-of-the-art models for TTA across both objective and subjective metrics, despite training the LDM on a 63 times smaller dataset. We release our model, training, inference code, and pre-trained checkpoints for the research community.
-'''
 # Gradio input and output components
 input_text = gr.inputs.Textbox(lines=2, label="Prompt")
@@ -95,7 +100,7 @@ gr_interface = gr.Interface(
     inputs=[input_text, denoising_steps, guidance_scale],
     outputs=[output_audio],
     title="TANGO: Text to Audio using Instruction-Guided Diffusion",
-    description="Generate audio using TANGO by providing a text prompt.",
     allow_flagging=False,
     examples=[
         ["An audience cheering and clapping"],
@@ -104,7 +109,9 @@ gr_interface = gr.Interface(
         ["A car engine revving"],
         ["A dog barking"],
         ["A cat meowing"],
         ["Emergency sirens wailing"],
         ["Whistling with birds chirping"],
         ["A person snoring"],
         ["Motor vehicles are driving with loud engines and a person whistles"],

     return output_filename
+description_text = "Generate audio using TANGO by providing a text prompt. \
+\n\nLimitations: TANGO is trained on the small AudioCaps dataset so it may not generate good audio \
+samples related to concepts that it has not seen in training (e.g. singing). For the same reason, TANGO \
+is not always able to finely control its generations over textual control prompts. For example, \
+the generations from TANGO for prompts Chopping tomatoes on a wooden table and Chopping potatoes \
+on a metal table are very similar. \
+\n\nWe are currently training another version of TANGO on larger datasets to enhance its generalization, \
+compositional and controllable generation ability."
 # Gradio input and output components
 input_text = gr.inputs.Textbox(lines=2, label="Prompt")
     inputs=[input_text, denoising_steps, guidance_scale],
     outputs=[output_audio],
     title="TANGO: Text to Audio using Instruction-Guided Diffusion",
+    description=description_text,
     allow_flagging=False,
     examples=[
         ["An audience cheering and clapping"],
         ["A car engine revving"],
         ["A dog barking"],
         ["A cat meowing"],
+        ["Wooden table tapping sound while water pouring"],
         ["Emergency sirens wailing"],
+        ["two gunshots followed by birds flying away while chirping"],
         ["Whistling with birds chirping"],
         ["A person snoring"],
         ["Motor vehicles are driving with loud engines and a person whistles"],