Video-to-SoundFX

Running

App Files Files Community

fffiloni committed on Jan 23, 2024

Commit

2bb21bd

verified ·

1 Parent(s): d3a9b34

Create app.py

Browse files

Files changed (1) hide show

app.py +105 -0

app.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import json
+import re
+
+import gradio as gr
+from gradio_client import Client
+def get_caption(image_in):
+    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
+    kosmos2_result = kosmos2_client.predict(
+        image_in,	# str (filepath or URL to image) in 'Test Image' Image component
+        "Detailed",	# str in 'Description Type' Radio component
+        fn_index=4
+    )
+    print(f"KOSMOS2 RETURNS: {kosmos2_result}")
+    with open(kosmos2_result[1], 'r') as f:
+        data = json.load(f)
+    reconstructed_sentence = []
+    for sublist in data:
+        reconstructed_sentence.append(sublist[0])
+    full_sentence = ' '.join(reconstructed_sentence)
+    #print(full_sentence)
+    # Find the pattern matching the expected format ("Describe this image in detail:" followed by optional space and then the rest)...
+    pattern = r'^Describe this image in detail:\s*(.*)$'
+    # Apply the regex pattern to extract the description text.
+    match = re.search(pattern, full_sentence)
+    if match:
+        description = match.group(1)
+        print(description)
+    else:
+        print("Unable to locate valid description.")
+    # Find the last occurrence of "."
+    #last_period_index = full_sentence.rfind('.')
+    # Truncate the string up to the last period
+    #truncated_caption = full_sentence[:last_period_index + 1]
+    # print(truncated_caption)
+    #print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
+    return description
+def get_magnet(prompt):
+    amended_prompt = f"No Music. {prompt}"
+    client = Client("https://fffiloni-magnet.hf.space/--replicas/oo8sb/")
+    result = client.predict(
+        "facebook/magnet-small-10secs",	# Literal['facebook/magnet-small-10secs', 'facebook/magnet-medium-10secs', 'facebook/magnet-small-30secs', 'facebook/magnet-medium-30secs', 'facebook/audio-magnet-small', 'facebook/audio-magnet-medium']  in 'Model' Radio component
+        None,	# str  in 'Model Path (custom models)' Textbox component
+        amended_prompt,	# str  in 'Input Text' Textbox component
+        3,	# float  in 'Temperature' Number component
+        0.9,	# float  in 'Top-p' Number component
+        10,	# float  in 'Max CFG coefficient' Number component
+        1,	# float  in 'Min CFG coefficient' Number component
+        20,	# float  in 'Decoding Steps (stage 1)' Number component
+        10,	# float  in 'Decoding Steps (stage 2)' Number component
+        10,	# float  in 'Decoding Steps (stage 3)' Number component
+        10,	# float  in 'Decoding Steps (stage 4)' Number component
+        "prod-stride1 (new!)",	# Literal['max-nonoverlap', 'prod-stride1 (new!)']  in 'Span Scoring' Radio component
+        api_name="/predict_full"
+    )
+    print(result)
+    return result[0]
+def get_audioldm(prompt):
+    client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
+    result = client.predict(
+        prompt,	# str in 'Input text' Textbox component
+        "Low quality. Music.",	# str in 'Negative prompt' Textbox component
+        5,	# int | float (numeric value between 5 and 15) in 'Duration (seconds)' Slider component
+        0,	# int | float (numeric value between 0 and 7) in 'Guidance scale' Slider component
+        5,	# int | float in 'Seed' Number component
+        1,	# int | float (numeric value between 1 and 5) in 'Number waveforms to generate' Slider component
+        fn_index=1
+    )
+    print(result)
+    return result
+def infer(image_in):
+    caption = get_caption(image_in)
+    magnet_result = get_magnet(caption)
+    audioldm_result = get_audioldm(caption)
+    return magnet_result, audioldm_result
+with gr.Blocks() as demo:
+    with gr.Column():
+        gr.HTML("""
+        <h2 style="text-align: center;">
+            Image to SFX
+        </h2>
+        <p style="text-align: center;">
+            Compare MAGNet and AudioLDM2 sound effects generation from image caption (Kosmos2)
+        </p>
+        """)
+        with gr.Row():
+            with gr.Column():
+                image_in = gr.Image(sources=["upload"], type="filepath", label="Image input")
+                submit_btn = gr.Button("Submit")
+            with gr.Column():
+                magnet_o = gr.Video(label="MAGNet output")
+                audioldm2_o = gr.Video(label="AudioLDM2 output")
+demo.queue(max_size=10).launch(debug=True)