Spaces: Build error

Wenzheng Chang committed · Commit cd4da5b · Parent: d5d6d85

final version

Files changed:
- app.py (+33 -42)
- scripts/demo_gradio.py (+40 -62)
app.py
CHANGED

@@ -39,7 +39,6 @@ from aether.utils.postprocess_utils import ( # noqa: E402
 )
 from aether.utils.visualize_utils import predictions_to_glb # noqa: E402
 
-
 def seed_all(seed: int = 0) -> None:
     """
     Set random seeds of all components.
@@ -73,33 +72,10 @@ pipeline = AetherV1PipelineCogVideoX(
 )
 pipeline.vae.enable_slicing()
 pipeline.vae.enable_tiling()
-# pipeline.to(device)
 
 
 def build_pipeline(device: torch.device) -> AetherV1PipelineCogVideoX:
     """Initialize the model pipeline."""
-    # cogvideox_pretrained_model_name_or_path: str = "THUDM/CogVideoX-5b-I2V"
-    # aether_pretrained_model_name_or_path: str = "AetherWorldModel/AetherV1"
-    # pipeline = AetherV1PipelineCogVideoX(
-    #     tokenizer=AutoTokenizer.from_pretrained(
-    #         cogvideox_pretrained_model_name_or_path,
-    #         subfolder="tokenizer",
-    #     ),
-    #     text_encoder=T5EncoderModel.from_pretrained(
-    #         cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
-    #     ),
-    #     vae=AutoencoderKLCogVideoX.from_pretrained(
-    #         cogvideox_pretrained_model_name_or_path, subfolder="vae"
-    #     ),
-    #     scheduler=CogVideoXDPMScheduler.from_pretrained(
-    #         cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
-    #     ),
-    #     transformer=CogVideoXTransformer3DModel.from_pretrained(
-    #         aether_pretrained_model_name_or_path, subfolder="transformer"
-    #     ),
-    # )
-    # pipeline.vae.enable_slicing()
-    # pipeline.vae.enable_tiling()
     pipeline.to(device)
     return pipeline
 
@@ -346,21 +322,34 @@ def save_output_files(
     os.makedirs(output_dir, exist_ok=True)
 
     if pointmap is None and raymap is not None:
-        # Generate pointmap from raymap and disparity
-        smooth_camera = kwargs.get("smooth_camera", True)
-        smooth_method = (
-            kwargs.get("smooth_method", "kalman") if smooth_camera else "none"
-        )
-
-        pointmap_dict = postprocess_pointmap(
-            disparity,
-            raymap,
-            vae_downsample_scale=8,
-            ray_o_scale_inv=0.1,
-            smooth_camera=smooth_camera,
-            smooth_method=smooth_method,
-        )
-        pointmap = pointmap_dict["pointmap"]
+        # # Generate pointmap from raymap and disparity
+        # smooth_camera = kwargs.get("smooth_camera", True)
+        # smooth_method = (
+        #     kwargs.get("smooth_method", "kalman") if smooth_camera else "none"
+        # )
+
+        # pointmap_dict = postprocess_pointmap(
+        #     disparity,
+        #     raymap,
+        #     vae_downsample_scale=8,
+        #     ray_o_scale_inv=0.1,
+        #     smooth_camera=smooth_camera,
+        #     smooth_method=smooth_method,
+        # )
+        # pointmap = pointmap_dict["pointmap"]
+
+        window_result = AetherV1PipelineOutput(
+            rgb=rgb,
+            disparity=disparity,
+            raymap=raymap
+        )
+        window_results = [window_result]
+        window_indices = [0]
+        _, _, poses_from_blend, pointmap = blend_and_merge_window_results(window_results, window_indices, kwargs)
+
+        # Use poses from blend_and_merge_window_results if poses is None
+        if poses is None:
+            poses = poses_from_blend
 
     if poses is None and raymap is not None:
         poses, _, _ = raymap_to_poses(raymap, ray_o_scale_inv=0.1)
@@ -432,7 +421,7 @@ def save_output_files(
     # flip Y axis and X axis of camera position
     flipped_poses[..., 1, 3] = -flipped_poses[..., 1, 3] # flip Y axis position
     flipped_poses[..., 0, 3] = -flipped_poses[..., 0, 3] # flip X axis position
-
+
     # use flipped point cloud and camera poses
     predictions = {
         "world_points": flipped_pointmap,
@@ -1512,7 +1501,7 @@ with gr.Blocks(
             with gr.Column(scale=1):
                 fps = gr.Dropdown(
                     choices=[8, 10, 12, 15, 24],
-                    value=
+                    value=24,
                     label="FPS",
                     info="Frames per second",
                 )
@@ -1816,8 +1805,9 @@ with gr.Blocks(
 
     run_button.click(
         fn=lambda task_type,
-        video_file,
+        video_file,
         image_file,
+        image_input_planning,
         goal_file,
         height,
         width,
@@ -1874,7 +1864,7 @@ with gr.Blocks(
         ]
         if task_type == "prediction"
         else [
-
+            image_input_planning,
             goal_file,
             height,
             width,
@@ -1897,6 +1887,7 @@ with gr.Blocks(
            task,
            video_input,
            image_input,
+           image_input_planning,
            goal_input,
            height,
            width,
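For reference, the fallback branch added to save_output_files above can be read as a small standalone helper: it wraps the single prediction in an AetherV1PipelineOutput, reuses the existing window-merging utility to obtain the point map and poses, and only decodes poses from the raymap if blending returns none. This is a minimal sketch, not the file's literal code; it assumes blend_and_merge_window_results and raymap_to_poses come from aether.utils.postprocess_utils (the import block the first hunk touches) and that blend_and_merge_window_results returns a 4-tuple whose last two items are poses and pointmap, as the unpacking in the diff implies.

# Sketch of the single-window fallback; import locations are assumptions,
# since app.py already has these names in scope at module level.
from aether.pipelines import AetherV1PipelineOutput  # assumed path; may differ in the repo
from aether.utils.postprocess_utils import (  # path taken from the file's own import block
    blend_and_merge_window_results,
    raymap_to_poses,
)


def pointmap_and_poses_from_single_window(rgb, disparity, raymap, kwargs):
    """Recover a point map (and camera poses) from one prediction window."""
    # Wrap the single prediction as a one-window sequence so the existing
    # multi-window blending / smoothing path can be reused unchanged.
    window_results = [AetherV1PipelineOutput(rgb=rgb, disparity=disparity, raymap=raymap)]
    window_indices = [0]
    _, _, poses, pointmap = blend_and_merge_window_results(
        window_results, window_indices, kwargs
    )
    # Same last-resort fallback as the surrounding code: decode camera poses
    # directly from the raymap with the scale factor used elsewhere.
    if poses is None and raymap is not None:
        poses, _, _ = raymap_to_poses(raymap, ray_o_scale_inv=0.1)
    return pointmap, poses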
scripts/demo_gradio.py
CHANGED

@@ -17,8 +17,10 @@ from diffusers import (
     CogVideoXTransformer3DModel,
 )
 from transformers import AutoTokenizer, T5EncoderModel
-import spaces
+# import spaces
 
+os.environ['GRADIO_TEMP_DIR'] = '.gradio_cache'
+os.makedirs(os.environ['GRADIO_TEMP_DIR'], exist_ok=True)
 
 rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
 
@@ -39,7 +41,6 @@ from aether.utils.postprocess_utils import ( # noqa: E402
 )
 from aether.utils.visualize_utils import predictions_to_glb # noqa: E402
 
-
 def seed_all(seed: int = 0) -> None:
     """
     Set random seeds of all components.
@@ -73,33 +74,10 @@ pipeline = AetherV1PipelineCogVideoX(
 )
 pipeline.vae.enable_slicing()
 pipeline.vae.enable_tiling()
-# pipeline.to(device)
 
 
 def build_pipeline(device: torch.device) -> AetherV1PipelineCogVideoX:
     """Initialize the model pipeline."""
-    # cogvideox_pretrained_model_name_or_path: str = "THUDM/CogVideoX-5b-I2V"
-    # aether_pretrained_model_name_or_path: str = "AetherWorldModel/AetherV1"
-    # pipeline = AetherV1PipelineCogVideoX(
-    #     tokenizer=AutoTokenizer.from_pretrained(
-    #         cogvideox_pretrained_model_name_or_path,
-    #         subfolder="tokenizer",
-    #     ),
-    #     text_encoder=T5EncoderModel.from_pretrained(
-    #         cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
-    #     ),
-    #     vae=AutoencoderKLCogVideoX.from_pretrained(
-    #         cogvideox_pretrained_model_name_or_path, subfolder="vae"
-    #     ),
-    #     scheduler=CogVideoXDPMScheduler.from_pretrained(
-    #         cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
-    #     ),
-    #     transformer=CogVideoXTransformer3DModel.from_pretrained(
-    #         aether_pretrained_model_name_or_path, subfolder="transformer"
-    #     ),
-    # )
-    # pipeline.vae.enable_slicing()
-    # pipeline.vae.enable_tiling()
     pipeline.to(device)
     return pipeline
 
@@ -346,21 +324,34 @@ def save_output_files(
     os.makedirs(output_dir, exist_ok=True)
 
     if pointmap is None and raymap is not None:
-        # Generate pointmap from raymap and disparity
-        smooth_camera = kwargs.get("smooth_camera", True)
-        smooth_method = (
-            kwargs.get("smooth_method", "kalman") if smooth_camera else "none"
-        )
-
-        pointmap_dict = postprocess_pointmap(
-            disparity,
-            raymap,
-            vae_downsample_scale=8,
-            ray_o_scale_inv=0.1,
-            smooth_camera=smooth_camera,
-            smooth_method=smooth_method,
-        )
-        pointmap = pointmap_dict["pointmap"]
+        # # Generate pointmap from raymap and disparity
+        # smooth_camera = kwargs.get("smooth_camera", True)
+        # smooth_method = (
+        #     kwargs.get("smooth_method", "kalman") if smooth_camera else "none"
+        # )
+
+        # pointmap_dict = postprocess_pointmap(
+        #     disparity,
+        #     raymap,
+        #     vae_downsample_scale=8,
+        #     ray_o_scale_inv=0.1,
+        #     smooth_camera=smooth_camera,
+        #     smooth_method=smooth_method,
+        # )
+        # pointmap = pointmap_dict["pointmap"]
+
+        window_result = AetherV1PipelineOutput(
+            rgb=rgb,
+            disparity=disparity,
+            raymap=raymap
+        )
+        window_results = [window_result]
+        window_indices = [0]
+        _, _, poses_from_blend, pointmap = blend_and_merge_window_results(window_results, window_indices, kwargs)
+
+        # Use poses from blend_and_merge_window_results if poses is None
+        if poses is None:
+            poses = poses_from_blend
 
     if poses is None and raymap is not None:
         poses, _, _ = raymap_to_poses(raymap, ray_o_scale_inv=0.1)
@@ -432,7 +423,7 @@ def save_output_files(
     # flip Y axis and X axis of camera position
     flipped_poses[..., 1, 3] = -flipped_poses[..., 1, 3] # flip Y axis position
     flipped_poses[..., 0, 3] = -flipped_poses[..., 0, 3] # flip X axis position
-
+
     # use flipped point cloud and camera poses
     predictions = {
         "world_points": flipped_pointmap,
@@ -461,7 +452,7 @@ def save_output_files(
     return paths
 
 
-@spaces.GPU(duration=300)
+# @spaces.GPU(duration=300)
 def process_reconstruction(
     video_file,
     height,
@@ -586,7 +577,7 @@ def process_reconstruction(
         return None, None, []
 
 
-@spaces.GPU(duration=300)
+# @spaces.GPU(duration=300)
 def process_prediction(
     image_file,
     height,
@@ -718,7 +709,7 @@ def process_prediction(
         return None, None, []
 
 
-@spaces.GPU(duration=300)
+# @spaces.GPU(duration=300)
 def process_planning(
     image_file,
     goal_file,
@@ -1377,21 +1368,6 @@ with gr.Blocks(
 
     with gr.Row(elem_classes=["main-interface"]):
         with gr.Column(elem_classes=["input-column"]):
-            gpu_time_warning = gr.Markdown(
-                """
-                <div class="warning-box">
-                <strong>⚠️ Warning:</strong><br>
-                Due to HuggingFace Spaces ZERO GPU quota limitations, only short video reconstruction tasks (less than 100 frames) can be completed online.
-
-                <strong>💻 Recommendation:</strong><br>
-                We strongly encourage you to deploy Aether locally for:
-                - Processing longer video reconstruction tasks
-                - Better performance and full access to prediction and planning tasks
-
-                Visit our <a href="https://github.com/OpenRobotLab/Aether" target="_blank">GitHub repository</a> for local deployment instructions.
-                </div>
-                """,
-            )
             with gr.Group(elem_classes=["task-selector"]):
                 task = gr.Radio(
                     ["reconstruction", "prediction", "planning"],
@@ -1512,7 +1488,7 @@ with gr.Blocks(
             with gr.Column(scale=1):
                 fps = gr.Dropdown(
                     choices=[8, 10, 12, 15, 24],
-                    value=
+                    value=24,
                     label="FPS",
                     info="Frames per second",
                 )
@@ -1816,8 +1792,9 @@ with gr.Blocks(
 
     run_button.click(
         fn=lambda task_type,
-        video_file,
+        video_file,
         image_file,
+        image_input_planning,
         goal_file,
         height,
         width,
@@ -1874,7 +1851,7 @@ with gr.Blocks(
         ]
         if task_type == "prediction"
        else [
-
+            image_input_planning,
            goal_file,
            height,
            width,
@@ -1897,6 +1874,7 @@ with gr.Blocks(
            task,
            video_input,
            image_input,
+           image_input_planning,
            goal_input,
            height,
            width,
@@ -1940,4 +1918,4 @@ with gr.Blocks(
 
 if __name__ == "__main__":
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
-    demo.queue(max_size=20).launch(show_error=True, share=
+    demo.queue(max_size=20).launch(show_error=True, share=False, server_port=7860)
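Taken together, the scripts/demo_gradio.py changes make the demo self-hosted: the spaces ZeroGPU decorators are commented out, Gradio's temporary files are redirected to a local cache directory, and the app launches on a fixed port with no public share link. Below is a minimal, illustrative sketch of an equivalent local launch; only the cache directory, queue size, and launch arguments are taken from the diff, while the placeholder Blocks body stands in for the real UI.

import os

import gradio as gr

# Keep Gradio's temporary files inside the working tree, as the commit does.
os.environ["GRADIO_TEMP_DIR"] = ".gradio_cache"
os.makedirs(os.environ["GRADIO_TEMP_DIR"], exist_ok=True)

with gr.Blocks() as demo:
    # Placeholder body; the actual interface is built in scripts/demo_gradio.py.
    gr.Markdown("Aether demo placeholder")

if __name__ == "__main__":
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    # Local-only launch: no public share link, fixed port, request queue capped at 20.
    demo.queue(max_size=20).launch(show_error=True, share=False, server_port=7860)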