Update deformes4D_engine.py

deformes4D_engine.py (CHANGED: +32 -38)
@@ -79,11 +79,11 @@ class Deformes4DEngine:
        video_np = (video_tensor.detach().cpu().float().numpy() * 255).astype(np.uint8)
        with imageio.get_writer(path, fps=fps, codec='libx264', quality=8) as writer:
            for frame in video_np: writer.append_data(frame)
        logger.info(f"Video saved to: {path}")

    def _preprocess_image_for_latent_conversion(self, image: Image.Image, target_resolution: tuple) -> Image.Image:
        if image.size != target_resolution:
            logger.info(f" - ACTION: Resizing image from {image.size} to {target_resolution} before latent conversion.")
            return ImageOps.fit(image, target_resolution, Image.Resampling.LANCZOS)
        return image

@@ -92,27 +92,27 @@ class Deformes4DEngine:
        tensor = torch.from_numpy(image_np).permute(2, 0, 1).unsqueeze(0).unsqueeze(2)
        tensor = (tensor * 2.0) - 1.0
        return self.pixels_to_latents(tensor)

    def _generate_video_and_audio_from_latents(self, latent_tensor, audio_prompt, base_name):
        silent_video_path = os.path.join(self.workspace_dir, f"{base_name}_silent.mp4")
        pixel_tensor = self.latents_to_pixels(latent_tensor)
        self.save_video_from_tensor(pixel_tensor, silent_video_path, fps=24)
        del pixel_tensor; gc.collect()

        try:
            result = subprocess.run(
                ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", silent_video_path],
                capture_output=True, text=True, check=True)
            frag_duration = float(result.stdout.strip())
        except (subprocess.CalledProcessError, ValueError, FileNotFoundError):
            logger.warning(f"ffprobe failed on {os.path.basename(silent_video_path)}. Computing duration manually.")
            num_pixel_frames = latent_tensor.shape[2] * 8
            frag_duration = num_pixel_frames / 24.0

        video_with_audio_path = audio_specialist_singleton.generate_audio_for_video(
            video_path=silent_video_path, prompt=audio_prompt,
            duration_seconds=frag_duration)

        if os.path.exists(silent_video_path):
            os.remove(silent_video_path)
        return video_with_audio_path

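When ffprobe is missing or fails, the except branch above derives the fragment duration from the latent count instead: each latent frame decodes to 8 pixel frames, played back at 24 fps. A minimal standalone sketch of that arithmetic (the function name and defaults are illustrative, not from the engine):

    def estimate_fragment_duration(num_latent_frames: int,
                                   temporal_scale: int = 8,
                                   fps: float = 24.0) -> float:
        """Sketch of the ffprobe fallback: each latent frame decodes
        to `temporal_scale` pixel frames, played back at `fps`."""
        num_pixel_frames = num_latent_frames * temporal_scale
        return num_pixel_frames / fps

    print(estimate_fragment_duration(12))  # 4.0 seconds for 12 latent frames
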
@@ -130,20 +130,20 @@ class Deformes4DEngine:
    def concatenate_videos_ffmpeg(self, video_paths: list[str], output_path: str) -> str:
        if not video_paths:
            raise gr.Error("No video fragments to assemble.")
        list_file_path = os.path.join(self.workspace_dir, "concat_list.txt")
        with open(list_file_path, 'w', encoding='utf-8') as f:
            for path in video_paths:
                f.write(f"file '{os.path.abspath(path)}'\n")
        cmd_list = ['ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', list_file_path, '-c', 'copy', output_path]
        logger.info("Running FFmpeg concatenation...")
        try:
            subprocess.run(cmd_list, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            logger.error(f"FFmpeg error: {e.stderr}")
            raise gr.Error(f"Final video assembly failed. Details: {e.stderr}")
        return output_path

    def generate_full_movie(self,
                            keyframes: list,
                            global_prompt: str,

@@ -157,18 +157,18 @@ class Deformes4DEngine:
                            video_resolution: int,
                            use_continuity_director: bool,
                            progress: gr.Progress = gr.Progress()):

        keyframe_paths = [item[0] if isinstance(item, tuple) else item for item in keyframes]
        video_clips_paths, story_history, audio_history = [], "", "This is the beginning of the film."
        target_resolution_tuple = (video_resolution, video_resolution)
        n_trim_latents = self._quantize_to_multiple(int(seconds_per_fragment * 24 * (overlap_percent / 100.0)), 8)
        #echo_frames = 8

        previous_latents_path = None
        num_transitions_to_generate = len(keyframe_paths) - 1

        for i in range(num_transitions_to_generate):
            progress((i + 1) / num_transitions_to_generate, desc=f"Producing transition {i+1}/{num_transitions_to_generate}")

            start_keyframe_path = keyframe_paths[i]
            destination_keyframe_path = keyframe_paths[i+1]

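The overlap length above is snapped to the 8-frames-per-latent temporal grid by `_quantize_to_multiple`, whose body is not part of this diff. A plausible sketch under that assumption (not the repository's actual implementation):

    def quantize_to_multiple(value: int, multiple: int) -> int:
        """Hypothetical stand-in for Deformes4DEngine._quantize_to_multiple:
        round down so trims stay aligned to the latent temporal grid."""
        if multiple <= 0:
            return value
        return (value // multiple) * multiple

    # e.g. 6 seconds at 24 fps with 25% overlap: int(6 * 24 * 0.25) = 36 -> 32
    print(quantize_to_multiple(36, 8))  # 32
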
@@ -191,10 +191,10 @@ class Deformes4DEngine:
                present_scene_desc=present_scene_desc, future_scene_desc=future_scene_desc
            )
            transition_type, motion_prompt = decision["transition_type"], decision["motion_prompt"]

            story_history += f"\n- Act {i+1} ({transition_type}): {motion_prompt}"

            if use_continuity_director:  # This checkbox is assumed to control both the video and sound directors
                if is_first_fragment:
                    audio_prompt = gemini_singleton.get_sound_director_prompt(
                        audio_history=audio_history,

@@ -211,7 +211,7 @@ class Deformes4DEngine:
                    )
            else:
                audio_prompt = present_scene_desc  # Fall back to the scene prompt when the sound director is disabled

            audio_history = audio_prompt

            conditioning_items = []

@@ -232,8 +232,8 @@ class Deformes4DEngine:
                trimmed_for_echo = previous_latents[:, :, :-n_trim_latents, :, :] if n_trim_latents > 0 and previous_latents.shape[2] > n_trim_latents else previous_latents
                echo_latents = trimmed_for_echo[:, :, -echo_frames:, :, :]
                handler_frame_position = n_trim_latents + echo_frames

                conditioning_items.append(LatentConditioningItem(echo_latents, 0, 1.0))
                conditioning_items.append(LatentConditioningItem(handler_latent, handler_frame_position, handler_strength))
                del previous_latents, handler_latent, trimmed_for_echo, echo_latents; gc.collect()

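For orientation: the conditioning items assembled here pin the echo window to latent frame 0 and the handler to frame n_trim_latents + echo_frames; the next hunk adds the destination keyframe at the last frame. A sketch of the resulting timeline, with placeholder values for quantities that are runtime parameters in the engine (handler_strength, destination_convergence_strength):

    n_trim_latents, echo_frames, total_frames_to_generate = 4, 8, 48

    timeline = [
        ("echo",        0,                            1.0),
        ("handler",     n_trim_latents + echo_frames, 0.8),
        ("destination", total_frames_to_generate - 1, 0.9),
    ]
    for name, frame_position, strength in timeline:
        print(f"{name:<12} -> latent frame {frame_position:>2}, strength {strength}")
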
@@ -241,42 +241,36 @@ class Deformes4DEngine:
            img_dest = self._preprocess_image_for_latent_conversion(Image.open(destination_keyframe_path).convert("RGB"), target_resolution_tuple)
            destination_latent = self.pil_to_latent(img_dest)
            conditioning_items.append(LatentConditioningItem(destination_latent, total_frames_to_generate - 1, destination_convergence_strength))

            new_full_latents = self._generate_latent_tensor_internal(conditioning_items, current_ltx_params, target_resolution_tuple, total_frames_to_generate)

            base_name = f"fragment_{i}_{int(time.time())}"
            new_full_latents_path = os.path.join(self.workspace_dir, f"{base_name}_full.pt")
            self.save_latent_tensor(new_full_latents, new_full_latents_path)

            previous_latents_path = new_full_latents_path

            latents_for_video = new_full_latents

-           # Apply trims only where needed, to avoid duplicated frames
            if not is_first_fragment:
-               if
-               # For every fragment except the last, remove the overlap from the end
-               is_last_fragment = (i == num_transitions_to_generate - 1)
-               if not is_last_fragment:
-                   if n_trim_latents > 0 and latents_for_video.shape[2] > n_trim_latents:
-                       latents_for_video = latents_for_video[:, :, :-n_trim_latents, :, :]
+               if echo_frames > 0 and latents_for_video.shape[2] > echo_frames: latents_for_video = latents_for_video[:, :, echo_frames:, :, :]
+               if n_trim_latents > 0 and latents_for_video.shape[2] > n_trim_latents: latents_for_video = latents_for_video[:, :, :-n_trim_latents, :, :]
+           else:
+               if n_trim_latents > 0 and latents_for_video.shape[2] > n_trim_latents: latents_for_video = latents_for_video[:, :, :-n_trim_latents, :, :]

            video_with_audio_path = self._generate_video_and_audio_from_latents(latents_for_video, audio_prompt, base_name)
            video_clips_paths.append(video_with_audio_path)

            if transition_type == "cut":
                previous_latents_path = None

            yield {"fragment_path": video_with_audio_path}

        final_movie_path = os.path.join(self.workspace_dir, f"final_movie_{int(time.time())}.mp4")
        self.concatenate_videos_ffmpeg(video_clips_paths, final_movie_path)

        logger.info(f"Full movie saved to: {final_movie_path}")
        yield {"final_path": final_movie_path}
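The substantive change in this commit is the trim block at new lines 255-259: every fragment now loses n_trim_latents latent frames from its tail (the stretch the next fragment regenerates), and non-first fragments additionally lose echo_frames from their head (already shown by the previous clip); the old is_last_fragment special case is gone. A minimal sketch of the slicing on a dummy (B, C, T, H, W) tensor, mirroring the conditions above (the example values are arbitrary):

    import torch

    echo_frames, n_trim_latents = 8, 4  # example values

    def trim_fragment(lat: torch.Tensor, is_first: bool) -> torch.Tensor:
        # Non-first fragments: drop the echo frames the previous clip already shows.
        if not is_first and echo_frames > 0 and lat.shape[2] > echo_frames:
            lat = lat[:, :, echo_frames:, :, :]
        # All fragments: drop the tail overlap the next clip will regenerate.
        if n_trim_latents > 0 and lat.shape[2] > n_trim_latents:
            lat = lat[:, :, :-n_trim_latents, :, :]
        return lat

    latents = torch.randn(1, 16, 40, 32, 32)  # dummy latent video, T = 40
    print(trim_fragment(latents, is_first=True).shape)   # (1, 16, 36, 32, 32)
    print(trim_fragment(latents, is_first=False).shape)  # (1, 16, 28, 32, 32)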