Update deformes4D_engine.py
deformes4D_engine.py (+32 -24) CHANGED
@@ -52,6 +52,7 @@ class Deformes4DEngine:
         self._vae.to(self.device); self._vae.eval()
         return self._vae
 
+    # ... (helper methods such as save/load/pixels_to_latents remain unchanged) ...
     def save_latent_tensor(self, tensor: torch.Tensor, path: str):
         torch.save(tensor.cpu(), path)
         logger.info(f"Tensor latente salvo em: {path}")

@@ -158,10 +159,8 @@ class Deformes4DEngine:
                          progress: gr.Progress = gr.Progress()):
 
         base_ltx_params = {
-            "guidance_scale": 1.0,
-            "stg_scale": 0.0,
-            "rescaling_scale": 0.15,
-            "num_inference_steps": 7,
+            "guidance_scale": 1.0, "stg_scale": 0.0,
+            "rescaling_scale": 0.15, "num_inference_steps": 7,
         }
 
         keyframe_paths = [item[0] if isinstance(item, tuple) else item for item in keyframes]

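Note: the consolidated defaults above are later extended per fragment via dict unpacking (see the `current_ltx_params` context line further down in this diff). A small sketch of that merge; the override values here are placeholders, not the engine's:

```python
# Sketch of how the consolidated defaults combine with per-fragment overrides,
# mirroring the call site later in this diff. Override values are placeholders.
base_ltx_params = {
    "guidance_scale": 1.0, "stg_scale": 0.0,
    "rescaling_scale": 0.15, "num_inference_steps": 7,
}
current_ltx_params = {
    **base_ltx_params,
    "handler_strength": 0.75,           # placeholder
    "motion_prompt": "slow dolly-in",   # placeholder
}
```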
@@ -169,7 +168,10 @@ class Deformes4DEngine:
         target_resolution_tuple = (video_resolution, video_resolution)
         n_trim_latents = self._quantize_to_multiple(int(seconds_per_fragment * 24 * (overlap_percent / 100.0)), 8)
 
-
+        # --- NEW LOGIC: variables that hold the continuity tensors ---
+        prepared_echo_latent = None
+        prepared_handler_latent = None
+
         num_transitions_to_generate = len(keyframe_paths) - 1
 
         for i in range(num_transitions_to_generate):

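`_quantize_to_multiple` is defined elsewhere in the file and is not touched by this diff. A plausible stand-in, with the overlap arithmetic worked through for illustrative inputs (the rounding mode is an assumption):

```python
# Plausible stand-in for the helper used above; the real _quantize_to_multiple
# is not shown in this diff, so round-to-nearest is an assumption.
def quantize_to_multiple(value: int, multiple: int) -> int:
    return round(value / multiple) * multiple

seconds_per_fragment, overlap_percent = 4.0, 20.0   # illustrative inputs
n_trim_latents = quantize_to_multiple(int(seconds_per_fragment * 24 * (overlap_percent / 100.0)), 8)
total_frames_to_generate = quantize_to_multiple(int(seconds_per_fragment * 24), 8) + 1
print(n_trim_latents, total_frames_to_generate)     # 16 97
```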
@@ -179,8 +181,9 @@ class Deformes4DEngine:
             destination_keyframe_path = keyframe_paths[i+1]
             present_scene_desc = storyboard[i]
 
-            is_first_fragment =
+            is_first_fragment = (prepared_handler_latent is None)
 
+            # ... (Gemini decision logic and the sound-director logic remain the same) ...
             if is_first_fragment:
                 transition_type = "start"
                 motion_prompt = gemini_singleton.get_initial_motion_prompt(

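Because the flag is now derived from state rather than from the loop index, clearing the prepared tensors after a "cut" (see the last hunk) makes the following fragment behave like a fresh start. A tiny illustration with a hypothetical transition sequence:

```python
# Hypothetical transition sequence; the real transition_type comes from the
# Gemini decision logic elided above.
prepared_handler_latent = None
for i, transition_type in enumerate(["start", "continuous", "cut", "continuous"]):
    is_first_fragment = (prepared_handler_latent is None)
    print(i, is_first_fragment)           # 0 True, 1 False, 2 False, 3 True
    prepared_handler_latent = object()    # stand-in for the tensor prepared later in the loop
    if transition_type == "cut":
        prepared_handler_latent = None
```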
@@ -224,6 +227,7 @@ class Deformes4DEngine:
             current_ltx_params = {**base_ltx_params, "handler_strength": handler_strength, "motion_prompt": motion_prompt}
             total_frames_to_generate = self._quantize_to_multiple(int(seconds_per_fragment * 24), 8) + 1
 
+            # --- NEW LOGIC: preparing the conditioning instructions ---
             if is_first_fragment:
                 img_start = self._preprocess_image_for_latent_conversion(Image.open(start_keyframe_path).convert("RGB"), target_resolution_tuple)
                 start_latent = self.pil_to_latent(img_start)

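The conditioning items built here and in the next hunk are constructed positionally as (latent, frame position, strength). The real `LatentConditioningItem` is defined elsewhere in the repo; the field names below are illustrative assumptions based only on these call sites:

```python
from dataclasses import dataclass
import torch

@dataclass
class LatentConditioningItem:
    latent: torch.Tensor   # latent block to pin, e.g. shape (B, C, T, H, W); assumed layout
    frame_position: int    # index on the fragment's latent timeline where it is anchored
    strength: float        # how strongly the sampler should honor it
```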
@@ -233,15 +237,9 @@ class Deformes4DEngine:
                 destination_latent = self.pil_to_latent(img_dest)
                 conditioning_items.append(LatentConditioningItem(destination_latent, total_frames_to_generate - 1, destination_convergence_strength))
             else:
-
-
-
-                echo_latents = trimmed_for_echo[:, :, -echo_frames:, :, :]
-                handler_frame_position = n_trim_latents + echo_frames
-
-                conditioning_items.append(LatentConditioningItem(echo_latents, 0, 1.0))
-                conditioning_items.append(LatentConditioningItem(handler_latent, handler_frame_position, handler_strength))
-                del previous_latents, handler_latent, trimmed_for_echo, echo_latents; gc.collect()
+                # Use the tensors prepared in the previous iteration
+                conditioning_items.append(LatentConditioningItem(prepared_echo_latent, 0, 1.0))
+                conditioning_items.append(LatentConditioningItem(prepared_handler_latent, echo_frames, handler_strength))
             if transition_type == "continuous":
                 img_dest = self._preprocess_image_for_latent_conversion(Image.open(destination_keyframe_path).convert("RGB"), target_resolution_tuple)
                 destination_latent = self.pil_to_latent(img_dest)

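Two things change for continuation fragments: the echo and handler tensors are no longer re-derived from a separately loaded previous-latents tensor (hence the dropped `del ...; gc.collect()`), and the handler anchor moves from `n_trim_latents + echo_frames` to `echo_frames`, placing it immediately after the echo block. A minimal sketch of the new conditioning list, reusing the dataclass sketch above; tensor shapes and values are illustrative:

```python
import torch

echo_frames, handler_strength = 2, 0.75
prepared_echo_latent = torch.randn(1, 4, echo_frames, 32, 32)   # tail of the previous fragment (stand-in)
prepared_handler_latent = torch.randn(1, 4, 1, 32, 32)          # its single final latent (stand-in)

conditioning_items = [
    LatentConditioningItem(prepared_echo_latent, 0, 1.0),                            # pinned at t = 0, full strength
    LatentConditioningItem(prepared_handler_latent, echo_frames, handler_strength),  # right after the echo block
]
```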
@@ -249,21 +247,31 @@ class Deformes4DEngine:
 
             new_full_latents = self._generate_latent_tensor_internal(conditioning_items, current_ltx_params, target_resolution_tuple, total_frames_to_generate)
 
-
-
-            self.save_latent_tensor(new_full_latents, new_full_latents_path)
+            # --- NEW LOGIC: preparation moved to the end of the loop ---
+            is_last_fragment = (i == num_transitions_to_generate - 1)
 
-
+            if not is_last_fragment:
+                # ANTICIPATION: prepare the tensors for the NEXT iteration
+                prepared_handler_latent = new_full_latents[:, :, -1:, :, :].clone()
+                prepared_echo_latent = new_full_latents[:, :, -echo_frames:, :, :].clone()
+
+                # TRIM AT THE END: set the latents for the CURRENT video, removing the overlap
+                if n_trim_latents > 0 and new_full_latents.shape[2] > n_trim_latents:
+                    latents_for_video = new_full_latents[:, :, :-n_trim_latents, :, :]
+                else:
+                    latents_for_video = new_full_latents
+            else:
+                # The last fragment has nothing to prepare for the future, so it is rendered in full.
+                latents_for_video = new_full_latents
 
-
-
+            base_name = f"fragment_{i}_{int(time.time())}"
             video_with_audio_path = self._generate_video_and_audio_from_latents(latents_for_video, audio_prompt, base_name)
             video_clips_paths.append(video_with_audio_path)
 
-
             if transition_type == "cut":
-
-
+                # On a cut, clear the memory so the next iteration starts from scratch.
+                prepared_echo_latent = None
+                prepared_handler_latent = None
 
             yield {"fragment_path": video_with_audio_path}
 
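Putting the pieces together, a self-contained sketch of the new carry-over scheme with dummy latents of shape (B, C, T, H, W); all sizes are illustrative, the real values come from the quantization shown earlier:

```python
import torch

echo_frames, n_trim_latents = 2, 8       # illustrative; the engine derives these from the overlap settings
num_fragments, latent_frames = 3, 25     # illustrative fragment count and per-fragment latent length

prepared_echo_latent = None
prepared_handler_latent = None

for i in range(num_fragments):
    # Stand-in for the output of _generate_latent_tensor_internal.
    new_full_latents = torch.randn(1, 4, latent_frames, 32, 32)

    if i < num_fragments - 1:
        # Anticipation: keep the continuity tensors for the NEXT fragment.
        prepared_handler_latent = new_full_latents[:, :, -1:, :, :].clone()
        prepared_echo_latent = new_full_latents[:, :, -echo_frames:, :, :].clone()
        # Trim the overlap from the CURRENT fragment before rendering it.
        latents_for_video = new_full_latents[:, :, :-n_trim_latents, :, :]
    else:
        # The last fragment is rendered in full.
        latents_for_video = new_full_latents

    print(i, latents_for_video.shape[2])   # 17, 17, 25
```

Note that the echo is cut from the untrimmed tensor, so when `n_trim_latents >= echo_frames` the next fragment is seeded from material that the current clip's rendered portion does not include.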