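"""Gradio demo for DepthCrafter: generating consistent long depth sequences for open-world videos."""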
import gc
import os

import numpy as np
import spaces
import gradio as gr
import torch
from diffusers.training_utils import set_seed

from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
from depthcrafter.utils import read_video_frames, vis_sequence_depth, save_video
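
# Each example row matches the inputs of infer_depth:
# [video, num_denoising_steps, guidance_scale, max_res, process_length, target_fps]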
examples = [
    ["examples/example_01.mp4", 5, 1.0, 1024, -1, -1],
    ["examples/example_02.mp4", 5, 1.0, 1024, -1, -1],
    ["examples/example_03.mp4", 5, 1.0, 1024, -1, -1],
    ["examples/example_04.mp4", 5, 1.0, 1024, -1, -1],
    ["examples/example_05.mp4", 5, 1.0, 1024, -1, -1],
]
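
# Load the DepthCrafter UNet weights and plug them into the
# Stable Video Diffusion pipeline they were fine-tuned from.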
unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
    "tencent/DepthCrafter",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)
pipe = DepthCrafterPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    unet=unet,
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.to("cuda")
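
# The Space runs on ZeroGPU (hence the `spaces` import), so the inference
# function should request a GPU on demand. The 120 s duration below is an
# assumption for this demo, not a value from the original source.
@spaces.GPU(duration=120)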
def infer_depth(
    video: str,
    num_denoising_steps: int,
    guidance_scale: float,
    max_res: int = 1024,
    process_length: int = -1,
    target_fps: int = -1,
    # the options below are fixed for the demo and not exposed in the UI
    save_folder: str = "./demo_output",
    window_size: int = 110,
    overlap: int = 25,
    seed: int = 42,
    track_time: bool = True,
    save_npz: bool = False,
):
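    """Estimate a depth video for `video` and return the saved input and visualization paths."""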
    set_seed(seed)
    pipe.enable_xformers_memory_efficient_attention()

    frames, target_fps = read_video_frames(video, process_length, target_fps, max_res)
    # infer the depth maps with the DepthCrafter pipeline, windowed over long videos
    with torch.inference_mode():
        res = pipe(
            frames,
            height=frames.shape[1],
            width=frames.shape[2],
            output_type="np",
            guidance_scale=guidance_scale,
            num_inference_steps=num_denoising_steps,
            window_size=window_size,
            overlap=overlap,
            track_time=track_time,
        ).frames[0]
    # average the three output channels into a single-channel depth map
    res = res.sum(-1) / res.shape[-1]
    # normalize the depth map to [0, 1] across the whole video
    res = (res - res.min()) / (res.max() - res.min())
    # visualize the depth map
    vis = vis_sequence_depth(res)
    # save the depth map and visualization at the target FPS
    save_path = os.path.join(save_folder, os.path.splitext(os.path.basename(video))[0])
    print(f"==> saving results to {save_path}")
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    if save_npz:
        np.savez_compressed(save_path + ".npz", depth=res)
    save_video(res, save_path + "_depth.mp4", fps=target_fps)
    save_video(vis, save_path + "_vis.mp4", fps=target_fps)
    save_video(frames, save_path + "_input.mp4", fps=target_fps)
    # clear the cache for the next video
    gc.collect()
    torch.cuda.empty_cache()

    return [
        save_path + "_input.mp4",
        save_path + "_vis.mp4",
        # save_path + "_depth.mp4",
    ]
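
# Build the Gradio Blocks UI: input video on the left, the preprocessed and
# generated depth videos on the right, with advanced settings collapsed below.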
def construct_demo():
    with gr.Blocks(analytics_enabled=False) as depthcrafter_iface:
        gr.Markdown(
            """
            <div align='center'> <h1> DepthCrafter: Generating Consistent Long Depth Sequences for Open-world Videos </h1> \
            <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
                <a href='https://wbhu.github.io'>Wenbo Hu</a>, \
                <a href='https://scholar.google.com/citations?user=qgdesEcAAAAJ&hl=en'>Xiangjun Gao</a>, \
                <a href='https://xiaoyu258.github.io/'>Xiaoyu Li</a>, \
                <a href='https://scholar.google.com/citations?user=tZ3dS3MAAAAJ&hl=en'>Sijie Zhao</a>, \
                <a href='https://vinthony.github.io/academic'>Xiaodong Cun</a>, \
                <a href='https://yzhang2016.github.io'>Yong Zhang</a>, \
                <a href='https://home.cse.ust.hk/~quan'>Long Quan</a>, \
                <a href='https://scholar.google.com/citations?user=4oXBp9UAAAAJ&hl=en'>Ying Shan</a>\
            </h2> \
            <a style='font-size:18px;color: #000000'>If you find DepthCrafter useful, please help ⭐ the </a>\
            <a style='font-size:18px;color: #FF5DB0' href='https://github.com/Tencent/DepthCrafter'>[Github Repo]</a>\
            <a style='font-size:18px;color: #000000'>, which helps open-source projects. Thanks!</a>\
            <a style='font-size:18px;color: #000000' href='https://arxiv.org/abs/2409.02095'> [ArXiv] </a>\
            <a style='font-size:18px;color: #000000' href='https://depthcrafter.github.io/'> [Project Page] </a> </div>
            """
        )
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                input_video = gr.Video(label="Input Video")
            # with gr.Tab(label="Output"):
            with gr.Column(scale=2):
                with gr.Row(equal_height=True):
                    output_video_1 = gr.Video(
                        label="Preprocessed Video",
                        interactive=False,
                        autoplay=True,
                        loop=True,
                        show_share_button=True,
                        scale=5,
                    )
                    output_video_2 = gr.Video(
                        label="Generated Depth Video",
                        interactive=False,
                        autoplay=True,
                        loop=True,
                        show_share_button=True,
                        scale=5,
                    )
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                with gr.Row(equal_height=False):
                    with gr.Accordion("Advanced Settings", open=False):
                        num_denoising_steps = gr.Slider(
                            label="Denoising Steps",
                            minimum=1,
                            maximum=25,
                            value=5,
                            step=1,
                        )
                        guidance_scale = gr.Slider(
                            label="CFG Scale",
                            minimum=1.0,
                            maximum=1.2,
                            value=1.0,
                            step=0.1,
                        )
                        max_res = gr.Slider(
                            label="Max Resolution",
                            minimum=512,
                            maximum=2048,
                            value=1024,
                            step=64,
                        )
                        process_length = gr.Slider(
                            label="Process Length (frames, -1 for all)",
                            minimum=-1,
                            maximum=280,
                            value=60,
                            step=1,
                        )
                        process_target_fps = gr.Slider(
                            label="Target FPS (-1 to keep input FPS)",
                            minimum=-1,
                            maximum=30,
                            value=15,
                            step=1,
                        )
                    generate_btn = gr.Button("Generate")
            with gr.Column(scale=2):
                pass
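
        # cache_mode="lazy" renders cached example outputs on first use,
        # keeping Space startup fast.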
        gr.Examples(
            examples=examples,
            inputs=[
                input_video,
                num_denoising_steps,
                guidance_scale,
                max_res,
                process_length,
                process_target_fps,
            ],
            outputs=[output_video_1, output_video_2],
            fn=infer_depth,
            cache_examples=True,
            cache_mode="lazy",
        )
        gr.Markdown(
            """
            <span style='font-size:18px;color: #E7CCCC'>Note:
            To stay within the GPU time quota, the default parameters here favor efficiency,
            at the cost of shorter video length and slightly lower quality.
            If you have enough quota, you can adjust the parameters following our
            <a style='font-size:18px;color: #FF5DB0' href='https://github.com/Tencent/DepthCrafter'>[Github Repo]</a>
            for better results.
            </span>
            """
        )
        generate_btn.click(
            fn=infer_depth,
            inputs=[
                input_video,
                num_denoising_steps,
                guidance_scale,
                max_res,
                process_length,
                process_target_fps,
            ],
            outputs=[output_video_1, output_video_2],
        )

    return depthcrafter_iface
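
# Queue incoming requests so concurrent users are served one GPU job at a time
# (Gradio's default concurrency limit).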
if __name__ == "__main__":
    demo = construct_demo()
    demo.queue()
    # demo.launch(server_name="0.0.0.0", server_port=12345, debug=True, share=False)
    demo.launch(share=True)