Commit e562afd
Parent(s): 3af5e96
update app.py

app.py CHANGED
@@ -130,7 +130,7 @@ class FoleyController:
         prompt_textbox,
         negative_prompt_textbox,
         ip_adapter_scale,
-
+        temporal_scale,
         sampler_dropdown,
         sample_step_slider,
         cfg_scale_slider,
@@ -154,7 +154,7 @@ class FoleyController:
         if seed_textbox != "":
             torch.manual_seed(int(seed_textbox))
             generator.manual_seed(int(seed_textbox))
-        max_frame_nums =
+        max_frame_nums = 150
         frames, duration = read_frames_with_moviepy(input_video, max_frame_nums=max_frame_nums)
         if duration >= 10:
             duration = 10
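The new cap of 150 frames works together with the 10-second duration clamp just below it. For orientation, here is a hedged sketch of what a read_frames_with_moviepy-style helper plausibly does; the Space's real helper lives in this repo and may decode or subsample differently:

# Hedged sketch of a read_frames_with_moviepy-style helper (assumed, not the
# repo's actual implementation); written against moviepy < 2.0.
import numpy as np
from moviepy.editor import VideoFileClip

def read_frames_with_moviepy(video_path, max_frame_nums=None):
    clip = VideoFileClip(video_path)
    duration = clip.duration  # seconds
    frames = np.array(list(clip.iter_frames()))  # (T, H, W, C) uint8 RGB
    if max_frame_nums is not None and len(frames) > max_frame_nums:
        # Sample evenly so the kept frames still span the whole clip.
        keep = np.linspace(0, len(frames) - 1, max_frame_nums).astype(int)
        frames = frames[keep]
    return frames, duration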
@@ -169,7 +169,9 @@ class FoleyController:
         time_condition = time_condition + [-1] * (1024 - len(time_condition))
         # w -> b c h w
         time_condition = torch.FloatTensor(time_condition).unsqueeze(0).unsqueeze(0).unsqueeze(0).repeat(1, 1, 256, 1)
-
+
+        # Note that CLIP needs fewer frames
+        frames = frames[::10]
         images = self.image_processor(images=frames, return_tensors="pt").to(device)
         image_embeddings = self.image_encoder(**images).image_embeds
         image_embeddings = torch.mean(image_embeddings, dim=0, keepdim=True).unsqueeze(0).unsqueeze(0)
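Striding to every 10th frame keeps the CLIP pass cheap, since the per-frame embeddings are mean-pooled into a single clip-level vector anyway. A minimal sketch of that pathway, assuming the stock transformers CLIP vision classes; the checkpoint name and the Space's actual image_processor/image_encoder setup are assumptions:

# Minimal sketch of the CLIP embedding path, assuming stock transformers
# classes; the checkpoint is an assumption, not taken from this repo.
import numpy as np
import torch
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")

# Stand-in frames; in the app these come from read_frames_with_moviepy.
frames = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(150)]
frames = frames[::10]  # CLIP needs far fewer frames than the temporal branch
inputs = processor(images=frames, return_tensors="pt")
with torch.no_grad():
    embeds = encoder(**inputs).image_embeds  # (n_frames, embed_dim)
# Mean-pool per-frame embeddings into one clip-level visual embedding.
clip_embedding = embeds.mean(dim=0, keepdim=True)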
@@ -253,18 +255,20 @@ with gr.Blocks(css=css) as demo:
         negative_prompt_textbox = gr.Textbox(value=N_PROMPT, label="Negative prompt", lines=1)

         with gr.Row():
-
-
-
-
-            )
-
-
-
-
-
-
-
+            ip_adapter_scale = gr.Slider(label="Visual Content Scale", value=1.0, minimum=0, maximum=1)
+            temporal_scale = gr.Slider(label="Temporal Align Scale", value=0.2, minimum=0., maximum=1.0)
+
+        with gr.Accordion("Sampling Settings", open=False):
+            with gr.Row():
+                sampler_dropdown = gr.Dropdown(
+                    label="Sampling method",
+                    choices=list(scheduler_dict.keys()),
+                    value=list(scheduler_dict.keys())[0],
+                )
+                sample_step_slider = gr.Slider(
+                    label="Sampling steps", value=25, minimum=10, maximum=100, step=1
+                )
+                cfg_scale_slider = gr.Slider(label="CFG Scale", value=7.5, minimum=0, maximum=20)

         with gr.Row():
             seed_textbox = gr.Textbox(label="Seed", value=42)
@@ -273,7 +277,12 @@ with gr.Blocks(css=css) as demo:

         generate_button = gr.Button(value="Generate", variant="primary")

-
+        with gr.Column():
+            result_video = gr.Video(label="Generated Audio", interactive=False)
+            gr.Markdown('**Tips**: <br> \
+                1. With strong temporal visual cues in input video, you can scale up the **Temporal Align Scale**. <br>\
+                2. **Visual content scale** is the level of semantic alignment with visual content. \
+                ')

         generate_button.click(
             fn=controller.foley,
@@ -282,7 +291,7 @@ with gr.Blocks(css=css) as demo:
                 prompt_textbox,
                 negative_prompt_textbox,
                 ip_adapter_scale,
-
+                temporal_scale,
                 sampler_dropdown,
                 sample_step_slider,
                 cfg_scale_slider,
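This hunk and the first one must stay in lockstep: Gradio passes click inputs to the callback positionally, so the slot temporal_scale occupies here has to match its position in controller.foley's parameter list. A hypothetical signature implied by that ordering (the real one is in app.py and may differ):

# Hypothetical foley signature implied by the inputs ordering; names and any
# extra parameters in the real app.py may differ.
def foley(self, init_img, prompt_textbox, negative_prompt_textbox,
          ip_adapter_scale, temporal_scale, sampler_dropdown,
          sample_step_slider, cfg_scale_slider, seed_textbox):
    ...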
@@ -292,13 +301,22 @@ with gr.Blocks(css=css) as demo:
         )

         gr.Examples(
-            examples= [
-                ['examples/videos/51701454.mp4', 'seagulls', '', 1.0, 'DDIM', 25, 7.5, 10014024412012338098],
-                ['examples/videos/42.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 42],
-                ['examples/videos/1.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 93493458],
-                ['examples/videos/2.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 16520432],
+            # examples= [
+            #     ['examples/videos/51701454.mp4', 'seagulls', '', 1.0, 'DDIM', 25, 7.5, 10014024412012338098],
+            #     ['examples/videos/42.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 42],
+            #     ['examples/videos/1.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 93493458],
+            #     ['examples/videos/2.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 16520432],
+            # ],
+            examples=[
+                ['examples/input/case1.mp4', '', '', 1.0, 0.2, 'DDIM', 25, 7.5, 33817921],
+                ['examples/input/case3.mp4', '', '', 1.0, 0.2, 'DDIM', 25, 7.5, 94667578],
+                ['examples/input/case5.mp4', '', '', 0.75, 0.2, 'DDIM', 25, 7.5, 92890876],
+                ['examples/input/case6.mp4', '', '', 1.0, 0.2, 'DDIM', 25, 7.5, 77015909],
             ],
-            inputs=[init_img,prompt_textbox,negative_prompt_textbox,ip_adapter_scale,sampler_dropdown,sample_step_slider,cfg_scale_slider,seed_textbox],
+            inputs=[init_img,prompt_textbox,negative_prompt_textbox,ip_adapter_scale,temporal_scale,sampler_dropdown,sample_step_slider,cfg_scale_slider,seed_textbox],
+            cache_examples=True,
+            outputs=[result_video],
+            fn=controller.foley,
         )

 demo.queue(10)
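With cache_examples=True, Gradio runs every example through fn at build time and stores the outputs, which is why this commit supplies fn and outputs alongside inputs. A self-contained toy showing that contract (the components and function here are illustrative, not this Space's):

# Toy gr.Examples caching demo: cache_examples=True requires fn and outputs
# so results can be precomputed for each example row.
import gradio as gr

def echo(text):
    return text.upper()

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    gr.Examples(
        examples=[["hello"], ["world"]],
        inputs=[inp],
        outputs=[out],
        fn=echo,
        cache_examples=True,  # precompute echo(example) at startup
    )

if __name__ == "__main__":
    demo.launch()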