Update api/ltx/vae_aduc_pipeline.py
api/ltx/vae_aduc_pipeline.py CHANGED (+40 -80)
@@ -46,8 +46,8 @@ if str(LTX_VIDEO_REPO_DIR.resolve()) not in sys.path:
 
 from ltx_video.models.autoencoders.vae_encode import vae_encode, vae_decode, latent_to_pixel_coords
 from ltx_video.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
-from
-
+from pipeline_ltx_video import ConditioningItem as PipelineConditioningItem
+
 
 @dataclass
 class LatentConditioningItem:
@@ -59,7 +59,6 @@ class LatentConditioningItem:
 # --- MAIN VAE SERVICE CLASS ---
 # ==============================================================================
 
-@log_function_io
 class VaeAducPipeline:
     _instance = None
     _lock = threading.Lock()
@@ -71,7 +70,6 @@ class VaeAducPipeline:
         cls._instance._initialized = False
         return cls._instance
 
-    @log_function_io
     def __init__(self):
         if hasattr(self, '_initialized') and self._initialized: return
         with self._lock:
@@ -101,68 +99,24 @@ class VaeAducPipeline:
     # --- PUBLIC SERVICE METHODS ---
 
     @log_function_io
-    def encode_video(
-        self,
-        video_tensor: torch.Tensor,
-        vae_per_channel_normalize: bool = True
-    ) -> torch.Tensor:
-        """
-        [NEW] Encodes a video tensor (pixels) into the latent space.
-
-        Args:
-            video_tensor (torch.Tensor): Video tensor in (B, C, F, H, W) format with values in [0, 1].
-            vae_per_channel_normalize (bool): Whether to normalize the latents per channel.
-
-        Returns:
-            torch.Tensor: The resulting latent tensor on the CPU.
-        """
+    def encode_video(self, video_tensor: torch.Tensor, vae_per_channel_normalize: bool = True) -> torch.Tensor:
         logging.info(f"VaeAducPipeline: Encoding video with shape {video_tensor.shape}")
         if not (video_tensor.ndim == 5):
             raise ValueError(f"Input video tensor must be 5D (B, C, F, H, W), but got shape {video_tensor.shape}")
-
-        # Normalize the tensor from [0, 1] to [-1, 1]
         video_tensor_normalized = (video_tensor * 2.0) - 1.0
-
         try:
             video_gpu = video_tensor_normalized.to(self.device, dtype=self.dtype)
             with torch.no_grad():
-                latents = vae_encode(
-                    video_gpu,
-                    self.vae,
-                    vae_per_channel_normalize=vae_per_channel_normalize
-                )
+                latents = vae_encode(video_gpu, self.vae, vae_per_channel_normalize=vae_per_channel_normalize)
             logging.info(f"VaeAducPipeline: Successfully encoded video to latents of shape {latents.shape}")
             return latents.cpu()
         finally:
             self._cleanup_gpu()
 
     @log_function_io
-    def decode_and_resize_video(
-        self,
-        latent_tensor: torch.Tensor,
-        target_height: int,
-        target_width: int,
-        decode_timestep: float = 0.05
-    ) -> torch.Tensor:
-        """
-        [NEW] Decodes a latent tensor to pixels and resizes it to the final resolution.
-
-        Args:
-            latent_tensor (torch.Tensor): The latent tensor to decode.
-            target_height (int): The final video height.
-            target_width (int): The final video width.
-            decode_timestep (float): Timestep for the VAE decoder, if applicable.
-
-        Returns:
-            torch.Tensor: The pixel video tensor, resized and on the CPU.
-        """
+    def decode_and_resize_video(self, latent_tensor: torch.Tensor, target_height: int, target_width: int, decode_timestep: float = 0.05) -> torch.Tensor:
         logging.info(f"VaeAducPipeline: Decoding latents {latent_tensor.shape} and resizing to {target_height}x{target_width}")
-
-        # 1. Decode to pixels (using the already existing function)
-        # The result already comes back on the CPU
         pixel_video = self.decode_to_pixels(latent_tensor, decode_timestep)
-
-        # 2. Resize to the final size
         num_frames = pixel_video.shape[2]
         current_height, current_width = pixel_video.shape[3:]
 
@@ -170,33 +124,21 @@ class VaeAducPipeline:
             logging.info("VaeAducPipeline: Resizing skipped, already at target resolution.")
             return pixel_video
 
-        # Apply interpolation to resize
         videos_flat = rearrange(pixel_video, "b c f h w -> (b f) c h w")
-        videos_resized = F.interpolate(
-            videos_flat,
-            size=(target_height, target_width),
-            mode="bilinear",
-            align_corners=False,
-        )
+        videos_resized = F.interpolate(videos_flat, size=(target_height, target_width), mode="bilinear", align_corners=False)
        final_video = rearrange(videos_resized, "(b f) c h w -> b c f h w", f=num_frames)
-
         logging.info(f"VaeAducPipeline: Resized video to final shape {final_video.shape}")
         return final_video
 
     @log_function_io
     def decode_to_pixels(self, latent_tensor: torch.Tensor, decode_timestep: float = 0.05) -> torch.Tensor:
-        """Decodes a latent tensor into a pixel tensor, returned on the CPU."""
         t0 = time.time()
         try:
             latent_tensor_gpu = latent_tensor.to(self.device, dtype=self.dtype)
             num_items = latent_tensor_gpu.shape[0]
             timestep_tensor = torch.tensor([decode_timestep] * num_items, device=self.device, dtype=self.dtype)
-
             with torch.no_grad():
-                pixels = vae_decode(
-                    latent_tensor_gpu, self.vae, is_video=True,
-                    timestep=timestep_tensor, vae_per_channel_normalize=True
-                )
+                pixels = vae_decode(latent_tensor_gpu, self.vae, is_video=True, timestep=timestep_tensor, vae_per_channel_normalize=True)
             logging.info(f"VaeAducPipeline: Decoded latents {latent_tensor.shape} in {time.time() - t0:.2f}s.")
             return pixels.cpu()
         finally:
@@ -213,7 +155,6 @@ class VaeAducPipeline:
         vae_per_channel_normalize: bool = True,
         generator: Optional[torch.Generator] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]:
-        """Prepares conditioning tensors from a list of pixel or latent items."""
         init_latents = init_latents.to(self.device, dtype=self.dtype)
 
         if not conditioning_items:
@@ -236,9 +177,14 @@ class VaeAducPipeline:
                 init_latents[..., :f, :h, :w] = torch.lerp(init_latents[..., :f, :h, :w], latents, item.conditioning_strength)
                 mask[..., :f, :h, :w] = item.conditioning_strength
             else:
-
-
-
+                if latents.shape[2] > 1:
+                    init_latents, mask, latents = self._handle_non_first_sequence(
+                        init_latents, mask, latents, item.media_frame_number, item.conditioning_strength
+                    )
+                if latents is not None:
+                    latents_p, coords_p, new_mask, num_new = self._process_extra_item(latents, item, generator)
+                    extra_latents.append(latents_p); extra_coords.append(coords_p); extra_masks.append(new_mask)
+                    num_extra_latents += num_new
         else:
             for item in conditioning_items:
                 item_resized = self._resize_conditioning_item(item, height, width)
@@ -252,7 +198,9 @@ class VaeAducPipeline:
                     mask[..., :f, ly:ly+h, lx:lx+w] = item.conditioning_strength
                 else:
                     if media_item.shape[2] > 1:
-                        init_latents, mask, latents = self._handle_non_first_sequence(
+                        init_latents, mask, latents = self._handle_non_first_sequence(
+                            init_latents, mask, latents, item.media_frame_number, item.conditioning_strength
+                        )
                     if latents is not None:
                         latents_p, coords_p, new_mask, num_new = self._process_extra_item(latents, item, generator)
                         extra_latents.append(latents_p); extra_coords.append(coords_p); extra_masks.append(new_mask)
@@ -282,7 +230,7 @@ class VaeAducPipeline:
         if torch.cuda.is_available():
             with torch.cuda.device(self.device): torch.cuda.empty_cache()
 
-    def _latent_to_pixel_coords(self, c): return latent_to_pixel_coords(c, self.vae, self.transformer.config.causal_temporal_positioning)
+    def _latent_to_pixel_coords(self, c): return latent_to_pixel_coords(c, self.vae, causal_fix=self.transformer.config.causal_temporal_positioning)
 
     @staticmethod
     def _resize_tensor(m, h, w):
@@ -307,15 +255,27 @@ class VaeAducPipeline:
         if (ys + hi) < h: l = l[..., :-1, :]
         return l, xs // s, ys // s
 
-    def _handle_non_first_sequence(
-
+    def _handle_non_first_sequence(
+        self,
+        init_latents: torch.Tensor,
+        mask: torch.Tensor,
+        latents: torch.Tensor,
+        media_frame_number: int,
+        conditioning_strength: float,
+        num_prefix=2,
+        mode="concat"
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        fl, flp = latents.shape[2], num_prefix
         if fl > flp:
-
-
-
-
-
-
+            start = media_frame_number // 8 + flp
+            end = start + fl - flp
+            init_latents[..., start:end, :, :] = torch.lerp(init_latents[..., start:end, :, :], latents[..., flp:, :, :], conditioning_strength)
+            mask[..., start:end, :, :] = conditioning_strength
+            if mode == "concat":
+                latents = latents[..., :flp, :, :]
+            else:
+                latents = None
+        return init_latents, mask, latents
 
     def _process_extra_item(self, l, i, g):
         n = randn_tensor(l.shape, generator=g, device=self.device, dtype=self.dtype)
@@ -326,5 +286,5 @@ class VaeAducPipeline:
         nm = torch.full(lp.shape[:2], i.conditioning_strength, dtype=torch.float32, device=self.device)
         return lp, cp, nm, nl
 
-# ---
+# --- Singleton instantiation ---
 vae_aduc_pipeline = VaeAducPipeline()
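For reference, the flattened public API round-trips as before: encode_video() still rescales pixels from [0, 1] to [-1, 1], encodes on the GPU, and returns latents on the CPU, while decode_and_resize_video() decodes and then bilinearly resizes. A minimal usage sketch, assuming the module is importable as api.ltx.vae_aduc_pipeline and that a VAE is configured when the singleton loads; the shapes are illustrative only:

import torch
from api.ltx.vae_aduc_pipeline import vae_aduc_pipeline

# Dummy clip: batch 1, RGB, 9 frames, 256x384, pixel values in [0, 1].
video = torch.rand(1, 3, 9, 256, 384)

# Returns latents on the CPU; raises ValueError if the tensor is not 5D.
latents = vae_aduc_pipeline.encode_video(video)

# Decodes to pixels, then resizes every frame to 512x768 with bilinear interpolation.
restored = vae_aduc_pipeline.decode_and_resize_video(latents, target_height=512, target_width=768)
print(latents.shape, restored.shape)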
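The newly spelled-out _handle_non_first_sequence() maps a pixel-space frame number to a latent index by dividing by the VAE's temporal downscale factor (the literal 8 above) and skipping num_prefix prefix latents, then blends the remaining latents into init_latents with torch.lerp. A standalone sketch of that indexing on dummy tensors; the helper name and shapes are illustrative, not part of the commit:

import torch

def blend_non_first_sequence(init_latents, mask, latents, media_frame_number, strength, num_prefix=2, mode="concat"):
    # Mirrors the indexing of the method added in this commit, minus the class plumbing.
    fl, flp = latents.shape[2], num_prefix
    if fl > flp:
        start = media_frame_number // 8 + flp  # pixel frame -> latent index (temporal downscale of 8)
        end = start + fl - flp
        init_latents[..., start:end, :, :] = torch.lerp(init_latents[..., start:end, :, :], latents[..., flp:, :, :], strength)
        mask[..., start:end, :, :] = strength
        latents = latents[..., :flp, :, :] if mode == "concat" else None
    return init_latents, mask, latents

# A clip conditioned at pixel frame 24 with 5 latent frames lands on latent indices 5..7.
init = torch.zeros(1, 128, 16, 8, 8)
mask = torch.zeros(1, 1, 16, 8, 8)
cond = torch.ones(1, 128, 5, 8, 8)
init, mask, prefix = blend_non_first_sequence(init, mask, cond, media_frame_number=24, strength=0.9)
print(mask[0, 0, :, 0, 0])  # 0.9 at indices 5, 6, 7; zeros elsewhere
print(prefix.shape)         # the two prefix latents kept for "concat" mode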