euiiiia committed on
Commit 4aa7f1b · verified · 1 Parent(s): e6bfe26

Update LTX-Video/ltx_video/pipelines/pipeline_ltx_video (1).py

LTX-Video/ltx_video/pipelines/pipeline_ltx_video (1).py CHANGED
@@ -190,32 +190,72 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
 
 @dataclass
-class LatentConditioningItem:
-    """Data item for conditioning the LTX pipeline."""
-    latent_tensor: torch.Tensor
     media_frame_number: int
     conditioning_strength: float
 
-@dataclass
-class ConditioningItem:
     """
-    Defines a single frame-conditioning item - a single frame or a sequence of frames.
-
-    Attributes:
-        media_item (torch.Tensor): shape=(b, 3, f, h, w). The media item to condition on.
-        media_frame_number (int): The start-frame number of the media item in the generated video.
-        conditioning_strength (float): The strength of the conditioning (1.0 = full conditioning).
-        media_x (Optional[int]): Optional left x coordinate of the media item in the generated frame.
-        media_y (Optional[int]): Optional top y coordinate of the media item in the generated frame.
     """
 
-    media_item: torch.Tensor
-    media_frame_number: int
-    conditioning_strength: float
-    media_x: Optional[int] = None
-    media_y: Optional[int] = None
 
 
 class LTXVideoPipeline(DiffusionPipeline):
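The hunk above replaces the pixel-space `ConditioningItem` (and the interim `LatentConditioningItem`) with a single latent-first dataclass. As a reference for readers tracking the schema change, here is a minimal, self-contained sketch of the new shape, with field names taken from the diff and a placeholder tensor:

```python
import torch
from dataclasses import dataclass
from typing import Optional

@dataclass
class ConditioningItem:
    media_item_latents: torch.Tensor   # (B, C, f, H_lat, W_lat), already VAE-encoded
    media_frame_number: int            # start frame in the generated video
    conditioning_strength: float       # 1.0 = full conditioning
    media_x: Optional[int] = None      # optional left x position in the frame
    media_y: Optional[int] = None      # optional top y position in the frame

# Placeholder latent shape; the old schema carried raw pixels (b, 3, f, h, w) instead.
item = ConditioningItem(torch.randn(1, 128, 1, 16, 16), media_frame_number=0, conditioning_strength=1.0)
```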
@@ -1390,242 +1430,106 @@ class LTXVideoPipeline(DiffusionPipeline):
 
 
     def prepare_conditioning(
-        self: "LTXVideoPipeline",
-        conditioning_items: Optional[List[Union["ConditioningItem", "LatentConditioningItem"]]],
-        init_latents: torch.Tensor,
-        num_frames: int,
-        height: int,
-        width: int,
-        vae_per_channel_normalize: bool = False,
-        generator=None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
-        if not conditioning_items:
-            init_latents, init_latent_coords = self.patchifier.patchify(latents=init_latents)
-            init_pixel_coords = latent_to_pixel_coords(init_latent_coords, self.vae, causal_fix=self.transformer.config.causal_temporal_positioning)
-            return init_latents, init_pixel_coords, None, 0
-
-        init_conditioning_mask = torch.zeros_like(init_latents[:, 0, ...], dtype=torch.float32, device=init_latents.device)
-        extra_conditioning_latents, extra_conditioning_pixel_coords, extra_conditioning_mask = [], [], []
-        extra_conditioning_num_latents = 0
-
-        for item in conditioning_items:
-            if not isinstance(item, LatentConditioningItem):
-                logger.warning("Patch ADUC: conditioning item is not a LatentConditioningItem and will be ignored.")
-                continue
-
-            media_item_latents = item.latent_tensor.to(dtype=init_latents.dtype, device=init_latents.device)
-            media_frame_number, strength = item.media_frame_number, item.conditioning_strength
-
-            if media_frame_number == 0:
-                f_l, h_l, w_l = media_item_latents.shape[-3:]
-                init_latents[..., :f_l, :h_l, :w_l] = torch.lerp(init_latents[..., :f_l, :h_l, :w_l], media_item_latents, strength)
-                init_conditioning_mask[..., :f_l, :h_l, :w_l] = strength
-            else:
-                noise = randn_tensor(media_item_latents.shape, generator=generator, device=media_item_latents.device, dtype=media_item_latents.dtype)
-                media_item_latents = torch.lerp(noise, media_item_latents, strength)
-                patched_latents, latent_coords = self.patchifier.patchify(latents=media_item_latents)
-                pixel_coords = latent_to_pixel_coords(latent_coords, self.vae, causal_fix=self.transformer.config.causal_temporal_positioning)
-                pixel_coords[:, 0] += media_frame_number
-                extra_conditioning_num_latents += patched_latents.shape[1]
-                new_mask = torch.full(patched_latents.shape[:2], strength, dtype=torch.float32, device=init_latents.device)
-                extra_conditioning_latents.append(patched_latents)
-                extra_conditioning_pixel_coords.append(pixel_coords)
-                extra_conditioning_mask.append(new_mask)
-
-        init_latents, init_latent_coords = self.patchifier.patchify(latents=init_latents)
-        init_pixel_coords = latent_to_pixel_coords(init_latent_coords, self.vae, causal_fix=self.transformer.config.causal_temporal_positioning)
-        init_conditioning_mask, _ = self.patchifier.patchify(latents=init_conditioning_mask.unsqueeze(1))
-        init_conditioning_mask = init_conditioning_mask.squeeze(-1)
-
-        if extra_conditioning_latents:
-            init_latents = torch.cat([*extra_conditioning_latents, init_latents], dim=1)
-            init_pixel_coords = torch.cat([*extra_conditioning_pixel_coords, init_pixel_coords], dim=2)
-            init_conditioning_mask = torch.cat([*extra_conditioning_mask, init_conditioning_mask], dim=1)
-
-        return init_latents, init_pixel_coords, init_conditioning_mask, extra_conditioning_num_latents
-
-
-
-    def prepare_conditioning1(
         self,
         conditioning_items: Optional[List[ConditioningItem]],
         init_latents: torch.Tensor,
         num_frames: int,
         height: int,
         width: int,
-        vae_per_channel_normalize: bool = False,
         generator=None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
-        """
-        Prepare conditioning tokens based on the provided conditioning items.
-
-        This method encodes provided conditioning items (video frames or single frames) into latents
-        and integrates them with the initial latent tensor. It also calculates corresponding pixel
-        coordinates, a mask indicating the influence of conditioning latents, and the total number of
-        conditioning latents.
-
-        Args:
-            conditioning_items (Optional[List[ConditioningItem]]): A list of ConditioningItem objects.
-            init_latents (torch.Tensor): The initial latent tensor of shape (b, c, f_l, h_l, w_l), where
-                `f_l` is the number of latent frames, and `h_l` and `w_l` are latent spatial dimensions.
-            num_frames, height, width: The dimensions of the generated video.
-            vae_per_channel_normalize (bool, optional): Whether to normalize channels during VAE encoding.
-                Defaults to `False`.
-            generator: The random generator.
-
-        Returns:
-            Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
-                - `init_latents` (torch.Tensor): The updated latent tensor including conditioning latents,
-                  patchified into (b, n, c) shape.
-                - `init_pixel_coords` (torch.Tensor): The pixel coordinates corresponding to the updated
-                  latent tensor.
-                - `conditioning_mask` (torch.Tensor): A mask indicating the conditioning strength of each
-                  latent token.
-                - `num_cond_latents` (int): The total number of latent tokens added from conditioning items.
-
-        Raises:
-            AssertionError: If input shapes, dimensions, or conditions for applying conditioning are invalid.
-        """
         assert isinstance(self.vae, CausalVideoAutoencoder)
-
         if conditioning_items:
             batch_size, _, num_latent_frames = init_latents.shape[:3]
-
             init_conditioning_mask = torch.zeros(
                 init_latents[:, 0, :, :, :].shape,
                 dtype=torch.float32,
                 device=init_latents.device,
             )
-
             extra_conditioning_latents = []
             extra_conditioning_pixel_coords = []
             extra_conditioning_mask = []
-            extra_conditioning_num_latents = 0  # Number of extra conditioning latents added (should be removed before decoding)
-
-            # Process each conditioning item
-            for conditioning_item in conditioning_items:
-                conditioning_item = self._resize_conditioning_item(
-                    conditioning_item, height, width
-                )
-                media_item = conditioning_item.media_item
-                media_frame_number = conditioning_item.media_frame_number
-                strength = conditioning_item.conditioning_strength
-                assert media_item.ndim == 5  # (b, c, f, h, w)
-                b, c, n_frames, h, w = media_item.shape
-                assert (
-                    height == h and width == w
-                ) or media_frame_number == 0, f"Dimensions do not match: {height}x{width} != {h}x{w} - allowed only when media_frame_number == 0"
-                assert n_frames % 8 == 1
-                assert (
-                    media_frame_number >= 0
-                    and media_frame_number + n_frames <= num_frames
                 )
-
-                # Encode the provided conditioning media item
-                media_item_latents = vae_encode(
-                    media_item.to(dtype=self.vae.dtype, device=self.vae.device),
-                    self.vae,
-                    vae_per_channel_normalize=vae_per_channel_normalize,
-                ).to(dtype=init_latents.dtype)
-
-                # Handle the different conditioning cases
-                if media_frame_number == 0:
-                    # Get the target spatial position of the latent conditioning item
-                    media_item_latents, l_x, l_y = self._get_latent_spatial_position(
-                        media_item_latents,
-                        conditioning_item,
-                        height,
-                        width,
-                        strip_latent_border=True,
                     )
-                    b, c_l, f_l, h_l, w_l = media_item_latents.shape
-
-                    # First frame or sequence - just update the initial noise latents and the mask
-                    init_latents[:, :, :f_l, l_y : l_y + h_l, l_x : l_x + w_l] = (
-                        torch.lerp(
-                            init_latents[:, :, :f_l, l_y : l_y + h_l, l_x : l_x + w_l],
-                            media_item_latents,
-                            strength,
-                        )
                     )
-                    init_conditioning_mask[
-                        :, :f_l, l_y : l_y + h_l, l_x : l_x + w_l
-                    ] = strength
                 else:
-                    # Non-first frame or sequence
-                    if n_frames > 1:
-                        # Handle non-first sequence.
-                        # Encoded latents are either fully consumed, or the prefix is handled separately below.
-                        (
-                            init_latents,
-                            init_conditioning_mask,
-                            media_item_latents,
-                        ) = self._handle_non_first_conditioning_sequence(
-                            init_latents,
-                            init_conditioning_mask,
-                            media_item_latents,
-                            media_frame_number,
-                            strength,
                         )
-
-                    # Single frame or sequence-prefix latents
-                    if media_item_latents is not None:
                         noise = randn_tensor(
-                            media_item_latents.shape,
                             generator=generator,
-                            device=media_item_latents.device,
-                            dtype=media_item_latents.dtype,
                         )
-
-                        media_item_latents = torch.lerp(
-                            noise, media_item_latents, strength
-                        )
-
-                        # Patchify the extra conditioning latents and calculate their pixel coordinates
-                        media_item_latents, latent_coords = self.patchifier.patchify(
-                            latents=media_item_latents
                         )
                         pixel_coords = latent_to_pixel_coords(
                             latent_coords,
                             self.vae,
                             causal_fix=self.transformer.config.causal_temporal_positioning,
                         )
-
-                        # Update the frame numbers to match the target frame number
-                        pixel_coords[:, 0] += media_frame_number
-                        extra_conditioning_num_latents += media_item_latents.shape[1]
-
-                        conditioning_mask = torch.full(
-                            media_item_latents.shape[:2],
                             strength,
                             dtype=torch.float32,
                             device=init_latents.device,
                         )
-
-                        extra_conditioning_latents.append(media_item_latents)
                         extra_conditioning_pixel_coords.append(pixel_coords)
-                        extra_conditioning_mask.append(conditioning_mask)
-
-        # Patchify the updated latents and calculate their pixel coordinates
-        init_latents, init_latent_coords = self.patchifier.patchify(
-            latents=init_latents
-        )
         init_pixel_coords = latent_to_pixel_coords(
             init_latent_coords,
             self.vae,
             causal_fix=self.transformer.config.causal_temporal_positioning,
         )
-
         if not conditioning_items:
             return init_latents, init_pixel_coords, None, 0
-
         init_conditioning_mask, _ = self.patchifier.patchify(
             latents=init_conditioning_mask.unsqueeze(1)
         )
         init_conditioning_mask = init_conditioning_mask.squeeze(-1)
-
         if extra_conditioning_latents:
-            # Stack the extra conditioning latents, pixel coordinates and mask
             init_latents = torch.cat([*extra_conditioning_latents, init_latents], dim=1)
             init_pixel_coords = torch.cat(
                 [*extra_conditioning_pixel_coords, init_pixel_coords], dim=2
@@ -1633,25 +1537,15 @@ class LTXVideoPipeline(DiffusionPipeline):
             init_conditioning_mask = torch.cat(
                 [*extra_conditioning_mask, init_conditioning_mask], dim=1
             )
-
         if self.transformer.use_tpu_flash_attention:
-            # When flash attention is used, keep the original number of tokens by removing
-            # tokens from the end.
             init_latents = init_latents[:, :-extra_conditioning_num_latents]
-            init_pixel_coords = init_pixel_coords[
-                :, :, :-extra_conditioning_num_latents
-            ]
-            init_conditioning_mask = init_conditioning_mask[
-                :, :-extra_conditioning_num_latents
-            ]
-
-        return (
-            init_latents,
-            init_pixel_coords,
-            init_conditioning_mask,
-            extra_conditioning_num_latents,
-        )
 
     @staticmethod
     def _resize_conditioning_item(
         conditioning_item: ConditioningItem,
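Both removed implementations rely on the same blend: `torch.lerp` mixes conditioning latents into noise (or into the initial latents) weighted by `conditioning_strength`. A self-contained sketch of that operation, with placeholder shapes:

```python
import torch

# torch.lerp(a, b, w) == a + w * (b - a): w=0 keeps the noise, w=1 keeps the latents.
noise = torch.randn(1, 128, 1, 16, 16)        # placeholder latent shape
media_item_latents = torch.randn_like(noise)
strength = 0.75                               # conditioning_strength

blended = torch.lerp(noise, media_item_latents, strength)
assert torch.allclose(blended, noise + strength * (media_item_latents - noise))
```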
 
@@ -190,32 +190,72 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
+from typing import Union, Optional
+from PIL import Image, ImageOps
+import torch
+from dataclasses import dataclass
 
 @dataclass
+class ConditioningItem:
+    media_item_latents: torch.Tensor
     media_frame_number: int
     conditioning_strength: float
+    media_x: Optional[int] = None
+    media_y: Optional[int] = None
 
+def encode_conditioning_item(
+    self,
+    raw_item: Union[Image.Image, torch.Tensor],
+    frame_number: int,
+    strength: float,
+    height: int,
+    width: int,
+    vae_per_channel_normalize: bool = False,
+) -> ConditioningItem:
     """
+    Converts a PIL Image or a latent tensor into a ConditioningItem with encoded latents.
+
+    Args:
+        raw_item: PIL.Image.Image or torch.Tensor ([B, C, f, H, W] or [B, C, H, W]).
+        frame_number: start index in the video.
+        strength: conditioning weight (0.0–1.0).
+        height, width: target video resolution.
+        vae_per_channel_normalize: per-channel normalization in the VAE.
+
+    Returns:
+        ConditioningItem with correctly formatted media_item_latents.
     """
+    # 1) If a PIL image: resize and convert to pixel latents
+    if isinstance(raw_item, Image.Image):
+        pil = ImageOps.fit(raw_item, (width, height), Image.LANCZOS)
+        # image_to_latents: converts PIL → tensor [B, C, H_lat, W_lat]
+        pixel_latents = image_to_latents(pil)  # provided by your utility
+        # add a frame dimension if needed
+        if pixel_latents.ndim == 4:
+            pixel_latents = pixel_latents.unsqueeze(2)  # [B, C, 1, H_lat, W_lat]
+        latents = pixel_latents.to(dtype=self.vae.dtype, device=self.vae.device)
+
+        # encode via the video VAE
+        latents = vae_encode(
+            latents,
+            self.vae,
+            vae_per_channel_normalize=vae_per_channel_normalize,
+        ).to(dtype=latents.dtype, device=latents.device)
 
+    # 2) If already a latent tensor
+    elif isinstance(raw_item, torch.Tensor):
+        latents = raw_item
+        # optional: validate shape == (B, C, f, H_lat, W_lat)
+
+    else:
+        raise TypeError(f"Unsupported type: {type(raw_item)}")
+
+    return ConditioningItem(
+        media_item_latents=latents,
+        media_frame_number=frame_number,
+        conditioning_strength=strength,
+    )
+
 
 
 class LTXVideoPipeline(DiffusionPipeline):
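A possible call pattern for the `encode_conditioning_item` helper added above. This is a sketch, not pipeline API: `pipeline` and the input filename are assumptions, and `image_to_latents` is the utility the diff itself leaves to the caller. Because the helper is defined at module level with a `self` parameter, the pipeline object is passed explicitly:

```python
from PIL import Image

# Hypothetical usage; assumes `pipeline` is an LTXVideoPipeline instance and that
# image_to_latents (referenced in the diff) is available in scope.
image = Image.open("first_frame.png").convert("RGB")
item = encode_conditioning_item(
    pipeline,          # bound by hand, since the def is not attached to a class
    raw_item=image,
    frame_number=0,    # condition the start of the video
    strength=1.0,      # full conditioning
    height=512,
    width=768,
)
# item.media_item_latents now holds VAE-encoded latents ready for prepare_conditioning.
```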
 
@@ -1390,242 +1430,106 @@ class LTXVideoPipeline(DiffusionPipeline):
 
 
     def prepare_conditioning(
         self,
         conditioning_items: Optional[List[ConditioningItem]],
         init_latents: torch.Tensor,
         num_frames: int,
         height: int,
         width: int,
         generator=None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]:
         assert isinstance(self.vae, CausalVideoAutoencoder)
         if conditioning_items:
             batch_size, _, num_latent_frames = init_latents.shape[:3]
             init_conditioning_mask = torch.zeros(
                 init_latents[:, 0, :, :, :].shape,
                 dtype=torch.float32,
                 device=init_latents.device,
             )
             extra_conditioning_latents = []
             extra_conditioning_pixel_coords = []
             extra_conditioning_mask = []
+            extra_conditioning_num_latents = 0
+
+            for item in conditioning_items:
+                media_latents = item.media_item_latents.to(
+                    dtype=init_latents.dtype, device=init_latents.device
                 )
+                strength = item.conditioning_strength
+                frame_idx = item.media_frame_number
+
+                if frame_idx == 0:
+                    # spatial placement
+                    media_latents, l_x, l_y = self._get_latent_spatial_position(
+                        media_latents, item, height, width, strip_latent_border=True
                     )
+                    b, c_l, f_l, h_l, w_l = media_latents.shape
+
+                    init_latents[:, :, :f_l, l_y : l_y + h_l, l_x : l_x + w_l] = torch.lerp(
+                        init_latents[:, :, :f_l, l_y : l_y + h_l, l_x : l_x + w_l],
+                        media_latents,
+                        strength,
                     )
+                    init_conditioning_mask[:, :f_l, l_y : l_y + h_l, l_x : l_x + w_l] = strength
                 else:
+                    # non-first sequences
+                    if media_latents.shape[2] > 1:
+                        init_latents, init_conditioning_mask, media_latents = (
+                            self._handle_non_first_conditioning_sequence(
+                                init_latents,
+                                init_conditioning_mask,
+                                media_latents,
+                                frame_idx,
+                                strength,
+                            )
                         )
+                    if media_latents is not None:
                         noise = randn_tensor(
+                            media_latents.shape,
                             generator=generator,
+                            device=media_latents.device,
+                            dtype=media_latents.dtype,
                         )
+                        media_latents = torch.lerp(noise, media_latents, strength)
+                        # patchify
+                        media_latents, latent_coords = self.patchifier.patchify(
+                            latents=media_latents
                         )
                         pixel_coords = latent_to_pixel_coords(
                             latent_coords,
                             self.vae,
                             causal_fix=self.transformer.config.causal_temporal_positioning,
                         )
+                        pixel_coords[:, 0] += frame_idx
+                        extra_conditioning_num_latents += media_latents.shape[1]
+                        mask = torch.full(
+                            media_latents.shape[:2],
                             strength,
                             dtype=torch.float32,
                             device=init_latents.device,
                         )
+                        extra_conditioning_latents.append(media_latents)
                         extra_conditioning_pixel_coords.append(pixel_coords)
+                        extra_conditioning_mask.append(mask)
+
+        # patchify init_latents
+        init_latents, init_latent_coords = self.patchifier.patchify(latents=init_latents)
         init_pixel_coords = latent_to_pixel_coords(
             init_latent_coords,
             self.vae,
             causal_fix=self.transformer.config.causal_temporal_positioning,
         )
+
         if not conditioning_items:
             return init_latents, init_pixel_coords, None, 0
+
+        # patchify mask
         init_conditioning_mask, _ = self.patchifier.patchify(
             latents=init_conditioning_mask.unsqueeze(1)
         )
         init_conditioning_mask = init_conditioning_mask.squeeze(-1)
+
         if extra_conditioning_latents:
             init_latents = torch.cat([*extra_conditioning_latents, init_latents], dim=1)
             init_pixel_coords = torch.cat(
                 [*extra_conditioning_pixel_coords, init_pixel_coords], dim=2
@@ -1633,25 +1537,15 @@ class LTXVideoPipeline(DiffusionPipeline):
             init_conditioning_mask = torch.cat(
                 [*extra_conditioning_mask, init_conditioning_mask], dim=1
             )
         if self.transformer.use_tpu_flash_attention:
             init_latents = init_latents[:, :-extra_conditioning_num_latents]
+            init_pixel_coords = init_pixel_coords[:, :, :-extra_conditioning_num_latents]
+            init_conditioning_mask = init_conditioning_mask[:, :-extra_conditioning_num_latents]
+
+        return init_latents, init_pixel_coords, init_conditioning_mask, extra_conditioning_num_latents
+
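One piece of bookkeeping in the rewritten `prepare_conditioning` is easy to miss: extra conditioning tokens are prepended along the token dimension, and under TPU flash attention the same count is trimmed from the end so the sequence length stays fixed. A toy sketch with made-up shapes:

```python
import torch

init_latents = torch.randn(1, 1024, 128)          # (b, n_tokens, channels), placeholder
extra = torch.randn(1, 96, 128)                   # 96 extra conditioning tokens (assumed)

merged = torch.cat([extra, init_latents], dim=1)  # prepend: (1, 1120, 128)
trimmed = merged[:, :-extra.shape[1]]             # TPU path: drop from the end
assert trimmed.shape == init_latents.shape        # token count unchanged
```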
 
+
     @staticmethod
     def _resize_conditioning_item(
         conditioning_item: ConditioningItem,