Upload pipeline_ltx_video.py
pipeline_ltx_video.py  (CHANGED: +19 -56)
@@ -24,15 +24,6 @@ from transformers import (
    AutoTokenizer,
)

-
-from huggingface_hub import logging
-
-
-logging.set_verbosity_error()
-logging.set_verbosity_warning()
-logging.set_verbosity_info()
-logging.set_verbosity_debug()
-
from ltx_video.models.autoencoders.causal_video_autoencoder import (
    CausalVideoAutoencoder,
)
@@ -54,8 +45,7 @@ from ltx_video.models.autoencoders.vae_encode import (
)


-logger = logging.get_logger(__name__)
-
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


ASPECT_RATIO_1024_BIN = {
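The two hunks above revert the import-time logging configuration: the `from huggingface_hub import logging` import and the stacked `set_verbosity_*()` calls are dropped, and the plain module-level `logger = logging.get_logger(__name__)` is restored. A minimal sketch of the intended pattern, assuming huggingface_hub's logging helpers (the ones visible in the removed lines); the module name string is illustrative only:

    from huggingface_hub import logging

    # The library module only creates a named logger; it never touches global verbosity.
    logger = logging.get_logger("ltx_video.pipelines.pipeline_ltx_video")

    # Application code opts into a verbosity level once, instead of the module
    # forcing DEBUG output on every import.
    logging.set_verbosity_warning()
    logger.debug("hidden unless the caller raises verbosity to DEBUG")
    logger.warning("shown at WARNING and above")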
@@ -933,9 +923,6 @@ class LTXVideoPipeline(DiffusionPipeline):
            latent_height,
            latent_width,
        )
-
-        print(f"[ltxxxxxxxx] latent_shape {latent_shape}")
-

        # Prepare the list of denoising time-steps

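This hunk only drops a leftover debug print of the latent shape. If that information is still wanted, a sketch of the quieter alternative is to route it through the module logger restored above (not part of the diff):

    # Emitted only when the application raises verbosity to DEBUG.
    logger.debug("latent_shape %s", latent_shape)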
@@ -968,60 +955,38 @@ class LTXVideoPipeline(DiffusionPipeline):
                    timestep in self.allowed_inference_steps
                ), f"Invalid inference timestep {timestep}. Allowed timesteps are {self.allowed_inference_steps}."

-
-        # Corrected version in pipeline_ltx_video.py
        if guidance_timesteps:
            guidance_mapping = []
            for timestep in timesteps:
                indices = [
                    i for i, val in enumerate(guidance_timesteps) if val <= timestep
                ]
+                # assert len(indices) > 0, f"No guidance timestep found for {timestep}"
                guidance_mapping.append(
                    indices[0] if len(indices) > 0 else (len(guidance_timesteps) - 1)
                )
-
-        # FIX: ensure guidance_mapping exists even if guidance_timesteps is not provided.
-        # Create a safe identity mapping.
-        guidance_mapping = list(range(len(timesteps)))
-
-        print(f"[ltxxxxxxxx] guidance_mapping {guidance_mapping}")
-
+
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
-
-
-
-
-
-
-            # Normal case: remap using guidance_mapping
-            else:
-                return [param[guidance_mapping[i]] for i in range(len(timesteps))]
-        # Single-value case (float/int): stretch to the length of timesteps
-        else:
-            return [param] * len(timesteps)
+        if not isinstance(guidance_scale, List):
+            guidance_scale = [guidance_scale] * len(timesteps)
+        else:
+            guidance_scale = [
+                guidance_scale[guidance_mapping[i]] for i in range(len(timesteps))
+            ]

-
-
-        guidance_mapping = []
-        for timestep in timesteps:
-            indices = [i for i, val in enumerate(guidance_timesteps) if val >= timestep]
-            guidance_mapping.append(indices[-1] if indices else len(guidance_timesteps) - 1)
+        if not isinstance(stg_scale, List):
+            stg_scale = [stg_scale] * len(timesteps)
        else:
-
-
-
-
-
-
-
-
-        if skip_block_list is not None:
-            if len(skip_block_list) > 0 and not isinstance(skip_block_list[0], list):
-                skip_block_list = [skip_block_list] * len(timesteps)
-            elif isinstance(skip_block_list, list) and len(skip_block_list) > 1:  # Only remap if it is a list of lists
-                skip_block_list = [skip_block_list[guidance_mapping[i]] for i in range(len(timesteps))]
+            stg_scale = [stg_scale[guidance_mapping[i]] for i in range(len(timesteps))]
+
+        if not isinstance(rescaling_scale, List):
+            rescaling_scale = [rescaling_scale] * len(timesteps)
+        else:
+            rescaling_scale = [
+                rescaling_scale[guidance_mapping[i]] for i in range(len(timesteps))
+            ]

        # Normalize skip_block_list to always be None or a list of lists matching timesteps
        if skip_block_list is not None:
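This hunk is the heart of the change: the broken remapping code (an unconditional identity `guidance_mapping`, stray `else`/`return` fragments, and debug prints) is replaced by the original expansion of `guidance_scale`, `stg_scale`, and `rescaling_scale` to one value per denoising timestep. A self-contained sketch of that logic, with illustrative values and a hypothetical helper name (`expand_per_timestep`), not code from the pipeline itself:

    from typing import List, Union

    def expand_per_timestep(
        param: Union[float, List[float]],
        guidance_mapping: List[int],
        num_steps: int,
    ) -> List[float]:
        """Broadcast a scalar, or remap a per-phase list, to one value per timestep."""
        if not isinstance(param, list):
            return [param] * num_steps
        return [param[guidance_mapping[i]] for i in range(num_steps)]

    # Each timestep is mapped to the first guidance_timesteps entry that is <= it
    # (falling back to the last entry), exactly as in the restored loop above.
    timesteps = [1.0, 0.75, 0.5, 0.25]
    guidance_timesteps = [1.0, 0.5]
    guidance_mapping = []
    for t in timesteps:
        indices = [i for i, val in enumerate(guidance_timesteps) if val <= t]
        guidance_mapping.append(indices[0] if indices else len(guidance_timesteps) - 1)
    # guidance_mapping == [0, 1, 1, 1]

    print(expand_per_timestep(3.0, guidance_mapping, len(timesteps)))         # [3.0, 3.0, 3.0, 3.0]
    print(expand_per_timestep([3.0, 1.0], guidance_mapping, len(timesteps)))  # [3.0, 1.0, 1.0, 1.0]

In the pipeline itself the same expansion is written inline for each of the three scales rather than through a helper.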
@@ -1116,8 +1081,6 @@ class LTXVideoPipeline(DiffusionPipeline):
            generator=generator,
            vae_per_channel_normalize=vae_per_channel_normalize,
        )
-
-

        # Update the latents with the conditioning items and patchify them into (b, n, c)
        latents, pixel_coords, conditioning_mask, num_cond_latents = (