Aduc_sdr

Paused

App Files Files Community

euiia committed on Sep 2

Commit

9cdf9d7

verified ·

1 Parent(s): 63ceaa5

Update deformes4D_engine.py

Browse files

Files changed (1) hide show

deformes4D_engine.py +124 -133

deformes4D_engine.py CHANGED Viewed

@@ -26,6 +26,7 @@ from audio_specialist import audio_specialist_singleton
 from ltx_manager_helpers import ltx_manager_singleton
 from gemini_helpers import gemini_singleton
 from upscaler_specialist import upscaler_specialist_singleton
 from ltx_video.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
 from ltx_video.models.autoencoders.vae_encode import vae_encode, vae_decode
@@ -80,7 +81,7 @@ class Deformes4DEngine:
         video_tensor = video_tensor.squeeze(0).permute(1, 2, 3, 0)
         video_tensor = (video_tensor.clamp(-1, 1) + 1) / 2.0
         video_np = (video_tensor.detach().cpu().float().numpy() * 255).astype(np.uint8)
-        with imageio.get_writer(path, fps=fps, codec='libx264', quality=8) as writer:
             for frame in video_np: writer.append_data(frame)
     def _preprocess_image_for_latent_conversion(self, image: Image.Image, target_resolution: tuple) -> Image.Image:
@@ -94,56 +95,48 @@ class Deformes4DEngine:
         tensor = (tensor * 2.0) - 1.0
         return self.pixels_to_latents(tensor)
-    def _get_video_frame_count(self, video_path: str) -> int | None:
-        """Return the frame count of the first video stream, or None on failure.
-
-        Runs ffprobe with -count_frames on stream v:0 and parses the printed
-        nb_read_frames value. None is returned when the path does not exist,
-        when ffprobe cannot be run or exits non-zero, or when its output is
-        not an integer.
-        """
-        if not os.path.exists(video_path): return None
-        cmd = ['ffprobe', '-v', 'error', '-select_streams', 'v:0', '-count_frames',
-               '-show_entries', 'stream=nb_read_frames', '-of', 'default=nokey=1:noprint_wrappers=1', video_path]
-        try:
-            result = subprocess.run(cmd, check=True, capture_output=True, text=True, encoding='utf-8')
-            return int(result.stdout.strip())
-        # Best-effort probe: every failure is reported to the caller as None.
-        except Exception: return None
-    def _trim_last_frame_ffmpeg(self, input_path: str, output_path: str) -> bool:
-        """Write input_path to output_path without its last frame.
-
-        Returns True on success and False when ffmpeg exits non-zero. Other
-        errors raised by subprocess.run (for example a missing ffmpeg binary)
-        are not caught here.
-        """
-        frame_count = self._get_video_frame_count(input_path)
-        # Nothing to trim: move the input to the output path unchanged.
-        # NOTE(review): True is also returned when input_path does not exist,
-        # in which case no output file has been produced.
-        if frame_count is None or frame_count < 2:
-            if os.path.exists(input_path): os.rename(input_path, output_path)
-            return True
-        # Keep the frames whose index is below frame_count - 1, then reset timestamps.
-        vf_filter = f"select='lt(n,{frame_count - 1})',setpts=PTS-STARTPTS"
-        # -an: the trimmed output carries no audio stream.
-        cmd_list = ['ffmpeg', '-y', '-i', input_path, '-vf', vf_filter, '-an', output_path]
-        try:
-            subprocess.run(cmd_list, check=True, capture_output=True, text=True, encoding='utf-8')
-            return True
-        except subprocess.CalledProcessError: return False
-    def concatenate_videos_ffmpeg(self, video_paths: list[str], output_path: str) -> str:
-        """Join video files with ffmpeg's concat demuxer and return output_path.
-
-        Streams are copied ('-c copy'), not re-encoded. Raises gr.Error when
-        video_paths is empty or when ffmpeg exits non-zero.
-        """
-        if not video_paths: raise gr.Error("Nenhum fragmento de vídeo para montar.")
-        # The list file lives in the workspace, is overwritten on every call
-        # and is not deleted afterwards.
-        list_file_path = os.path.join(self.workspace_dir, "concat_list.txt")
-        with open(list_file_path, 'w', encoding='utf-8') as f:
-            # NOTE(review): paths are wrapped in single quotes without escaping;
-            # a path containing "'" would presumably yield a malformed list - confirm.
-            for path in video_paths: f.write(f"file '{os.path.abspath(path)}'\n")
-        cmd_list = ['ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', list_file_path, '-c', 'copy', output_path]
-        try:
-            subprocess.run(cmd_list, check=True, capture_output=True, text=True)
-        except subprocess.CalledProcessError as e:
-            raise gr.Error(f"Falha na montagem final do vídeo. Detalhes: {e.stderr}")
-        return output_path
-    def _generate_video_and_audio(self, silent_video_path: str, audio_prompt: str, base_name: str) -> str:
-        """Measure the video duration and ask the audio specialist for a version with audio.
-
-        Returns whatever audio_specialist_singleton.generate_audio_for_video
-        returns. base_name is accepted but not used in this body.
-        """
-        # Preferred source: the container duration reported by ffprobe.
         try:
             result = subprocess.run(
-                ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", silent_video_path],
                 capture_output=True, text=True, check=True)
-            duration = float(result.stdout.strip())
         except Exception:
-            # Fallback: derive the duration from the frame count at 24 fps; 0 when unknown.
-            frame_count = self._get_video_frame_count(silent_video_path)
-            duration = (frame_count / 24.0) if frame_count else 0
-        video_with_audio_path = audio_specialist_singleton.generate_audio_for_video(
-            video_path=silent_video_path, prompt=audio_prompt,
-            duration_seconds=duration)
-        return video_with_audio_path
     # NÚCLEO DA LÓGICA ADUC-SDR
     def generate_full_movie(self, keyframes: list, global_prompt: str, storyboard: list,
@@ -152,6 +145,9 @@ class Deformes4DEngine:
                             video_resolution: int, use_continuity_director: bool,
                             progress: gr.Progress = gr.Progress()):
         FPS = 24
         FRAMES_PER_LATENT_CHUNK = 8
         ECO_LATENT_CHUNKS = 2
@@ -171,18 +167,18 @@ class Deformes4DEngine:
         keyframe_paths = [item[0] if isinstance(item, tuple) else item for item in keyframes]
         story_history = ""
         eco_latent_for_next_loop = None
         dejavu_latent_for_next_loop = None
         num_transitions_to_generate = len(keyframe_paths) - 1
-        upscaled_latent_fragments = []
         for i in range(num_transitions_to_generate):
             fragment_index = i + 1
-            progress(i / (num_transitions_to_generate + 2), desc=f"Gerando Latentes do Fragmento {fragment_index}")
             past_keyframe_path = keyframe_paths[i - 1] if i > 0 else keyframe_paths[i]
             start_keyframe_path = keyframe_paths[i]
             destination_keyframe_path = keyframe_paths[i + 1]
@@ -193,16 +189,12 @@ class Deformes4DEngine:
             transition_type, motion_prompt = decision["transition_type"], decision["motion_prompt"]
             story_history += f"\n- Ato {fragment_index}: {motion_prompt}"
             expected_height, expected_width = 768, 1152
             downscale_factor = 2 / 3
             downscaled_height = self._quantize_to_multiple(int(expected_height * downscale_factor), 8)
             downscaled_width = self._quantize_to_multiple(int(expected_width * downscale_factor), 8)
             target_resolution_tuple = (downscaled_height, downscaled_width)
-            final_resolution_tuple = (expected_height, expected_width)
             conditioning_items = []
             if eco_latent_for_next_loop is None:
                img_start = self._preprocess_image_for_latent_conversion(Image.open(start_keyframe_path).convert("RGB"), target_resolution_tuple)
@@ -225,106 +217,105 @@ class Deformes4DEngine:
             if transition_type == "cut":
                 eco_latent_for_next_loop, dejavu_latent_for_next_loop = None, None
-            latent_upscale_fragment = self.upscale_latents(latents_video)
-            list_latents_fragments.append(latent_upscale_fragment)
-        progress((num_transitions_to_generate) / (num_transitions_to_generate + 2), desc="Concatenando latentes...")
-        tensors_para_concatenar = []
-        target_device = self.device
-        for idx, tensor_frag in enumerate(upscaled_latent_fragments):
-            tensor_on_target_device = tensor_frag.to(target_device)
-            if idx < len(upscaled_latent_fragments) - 1:
-                tensors_para_concatenar.append(tensor_on_target_device[:, :, :-1, :, :])
-            else:
-                tensors_para_concatenar.append(tensor_on_target_device)
         final_concatenated_latents = torch.cat(tensors_para_concatenar, dim=2)
-        progress((num_transitions_to_generate + 1) / (num_transitions_to_generate + 2), desc="Pós-produção (Upscale e Refinamento)...")
-        base_name = f"final_movie_hq_{int(time.time())}"
-        # Pós-produção: Upscale + Refine
-        #high_quality_video_path = self._render_and_post_process(
-        #    final_concatenated_latents,
-        #    base_name=base_name,
-        #    expected_height=720,
-        #    expected_width=720,
-        #    fps=24
-        #)
-        video_path = os.path.join(self.workspace_dir, f"{base_name}_HQ.mp4")
-        final_pixel_tensor = self.latents_to_pixels(final_concatenated_latents)
-        self.save_video_from_tensor(final_pixel_tensor, video_path, fps=24)
-        logger.info(f"Vídeo final salvo em: {video_path}")
-        #progress((num_transitions_to_generate + 1.5) / (num_transitions_to_generate + 2), desc="Gerando paisagem sonora...")
-        #video_with_audio_path = self._generate_video_and_audio(
-        #    silent_video_path=final_concatenated_latents,
-        #    audio_prompt=global_prompt,
-        #    base_name=base_name
-        #)
-        yield {"final_path": video_path}
     def refine_latents(self, latents: torch.Tensor,
                        fps: int = 24,
                        denoise_strength: float = 0.35,
                        refine_steps: int = 12,
                        motion_prompt: str = "refining video, improving details, cinematic quality") -> torch.Tensor:
-        """
-        Apply a refinement (denoise) pass to a latent tensor.
-
-        The tensor must have five dimensions; the last three are read as
-        frame count, latent height and latent width. Returns the first element
-        of the pair returned by self.ltx_manager.refine_latents.
-        """
-        logger.info(f"Recebido tensor latente com shape {latents.shape} para refinamento.")
-        # Unpack the latent tensor dimensions.
         _, _, num_frames, latent_h, latent_w = latents.shape
-        # Get the VAE scale factor (usually 8).
-        # NOTE(review): assumes config.scaling_factor is the latent-to-pixel spatial
-        # ratio; in some VAE configs that name is a latent normalisation scalar - confirm.
         vae_scale_factor = self.vae.config.scaling_factor if hasattr(self.vae.config, 'scaling_factor') else 8
-        # Compute the corresponding dimensions in PIXELS.
-        pixel_height = latent_h * vae_scale_factor
-        pixel_width = latent_w * vae_scale_factor
-        # Call ltx_manager with the correct parameters to avoid the AssertionError.
-        # current_fragment_index is filled with the current Unix timestamp.
         refined_latents_tensor, _ = self.ltx_manager.refine_latents(
-            latents,
-            height=pixel_height,
-            width=pixel_width,
-            video_total_frames=num_frames,
-            video_fps=fps,
-            motion_prompt=motion_prompt,
-            current_fragment_index=int(time.time()),
-            denoise_strength=denoise_strength,
-            refine_steps=refine_steps
-        )
-        logger.info(f"Retornando tensor latente refinado com shape: {refined_latents_tensor.shape}")
         return refined_latents_tensor
     def upscale_latents(self, latents: torch.Tensor) -> torch.Tensor:
-        """
-        Take a low-resolution latent tensor and return the 2x upscaled version.
-        This method acts as an interface to the UpscalerSpecialist.
-        """
-        logger.info(f"Recebido tensor latente com shape {latents.shape} para upscale.")
-        upscaled_latents = upscaler_specialist_singleton.upscale(latents)
-        logger.info(f"Retornando tensor latente upscaled com novo shape: {upscaled_latents.shape}")
-        return upscaled_latents
     def _generate_latent_tensor_internal(self, conditioning_items, ltx_params, target_resolution, total_frames_to_generate):
         kwargs = {
             **ltx_params, 'width': target_resolution[1], 'height': target_resolution[0],

 from ltx_manager_helpers import ltx_manager_singleton
 from gemini_helpers import gemini_singleton
 from upscaler_specialist import upscaler_specialist_singleton
+from hd_specialist import hd_specialist_singleton # Importa o novo especialista
 from ltx_video.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
 from ltx_video.models.autoencoders.vae_encode import vae_encode, vae_decode
         video_tensor = video_tensor.squeeze(0).permute(1, 2, 3, 0)
         video_tensor = (video_tensor.clamp(-1, 1) + 1) / 2.0
         video_np = (video_tensor.detach().cpu().float().numpy() * 255).astype(np.uint8)
+        with imageio.get_writer(path, fps=fps, codec='libx264', quality=8, output_params=['-pix_fmt', 'yuv420p']) as writer:
             for frame in video_np: writer.append_data(frame)
     def _preprocess_image_for_latent_conversion(self, image: Image.Image, target_resolution: tuple) -> Image.Image:
         tensor = (tensor * 2.0) - 1.0
         return self.pixels_to_latents(tensor)
+    def _get_video_duration(self, video_path: str) -> float:
+        """Return the container duration reported by ffprobe, in seconds.
+
+        Returns 0.0 when the path does not exist, when ffprobe cannot be run
+        or exits non-zero, or when its output does not parse as a float, so
+        callers must treat 0.0 as "unknown".
+        """
+        if not os.path.exists(video_path): return 0.0
         try:
             result = subprocess.run(
+                ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", video_path],
                 capture_output=True, text=True, check=True)
+            return float(result.stdout.strip())
         except Exception:
+            # Best-effort probe: every failure is reported as a zero duration.
+            return 0.0
+    def _combine_video_and_audio_ffmpeg(self, video_path: str, audio_path: str, output_path: str):
+        """Combine a video file with an audio file using ffmpeg.
+
+        Writes the result to output_path, overwriting it if present ('-y').
+        Raises gr.Error carrying ffmpeg's stderr when ffmpeg exits non-zero;
+        other errors raised by subprocess.run are not caught here.
+        """
+        cmd = [
+            'ffmpeg', '-y',
+            '-i', video_path,
+            '-i', audio_path,
+            '-c:v', 'copy',      # Copy the video stream without re-encoding
+            '-c:a', 'aac',       # Re-encode the audio to AAC, the standard for MP4
+            '-shortest',         # Stop encoding when the shortest stream ends
+            output_path
+        ]
+        try:
+            subprocess.run(cmd, check=True, capture_output=True, text=True, encoding='utf-8')
+            logger.info(f"Áudio e vídeo combinados com sucesso em {output_path}")
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Falha ao combinar áudio e vídeo. Detalhes: {e.stderr}")
+            raise gr.Error(f"Falha ao combinar áudio e vídeo: {e.stderr}")
+    def _generate_standalone_audio(self, video_for_duration_path: str, audio_prompt: str) -> str:
+        """Generate an audio file matching the video's duration and return its path.
+
+        Raises gr.Error when the duration of video_for_duration_path cannot be
+        determined (self._get_video_duration returned 0).
+        """
+        duration = self._get_video_duration(video_for_duration_path)
+        if duration == 0:
+            raise gr.Error("Não foi possível determinar a duração do vídeo para gerar o áudio.")
+        # This call is now expected to return only the path of the generated audio file
+        # (may require a small change in your audio_specialist)
+        # NOTE(review): this helper calls generate_audio, while generate_full_movie calls
+        # audio_specialist_singleton.generate_standalone_audio directly - confirm which exists.
+        audio_path = audio_specialist_singleton.generate_audio(
+            prompt=audio_prompt,
+            duration_seconds=duration,
+            output_dir=self.workspace_dir
+        )
+        return audio_path
     # NÚCLEO DA LÓGICA ADUC-SDR
     def generate_full_movie(self, keyframes: list, global_prompt: str, storyboard: list,
                             video_resolution: int, use_continuity_director: bool,
                             progress: gr.Progress = gr.Progress()):
+        """Generate the transition fragments between keyframes and render the movie.
+
+        This is a generator; the only active yield in the visible code is a
+        dict with the key "final_path". Progress is reported through the
+        progress callback as current_step / TOTAL_STEPS.
+
+        NOTE(review): parts of this body are not visible in this view (names
+        such as decision and latents_video are assigned in omitted lines), so
+        the comments below only describe what is shown.
+        """
+        TOTAL_STEPS = len(keyframes) - 1 + 5 # Fragments + 5 post-production steps
+        current_step = 0
         FPS = 24
         FRAMES_PER_LATENT_CHUNK = 8
         ECO_LATENT_CHUNKS = 2
         keyframe_paths = [item[0] if isinstance(item, tuple) else item for item in keyframes]
         story_history = ""
         eco_latent_for_next_loop = None
         dejavu_latent_for_next_loop = None
         num_transitions_to_generate = len(keyframe_paths) - 1
+        processed_latent_fragments = []
         for i in range(num_transitions_to_generate):
             fragment_index = i + 1
+            current_step += 1
+            progress(current_step / TOTAL_STEPS, desc=f"Gerando Fragmento {fragment_index}/{num_transitions_to_generate}")
+            # ... (Gemini decision logic and parameter setup - unchanged)
             past_keyframe_path = keyframe_paths[i - 1] if i > 0 else keyframe_paths[i]
             start_keyframe_path = keyframe_paths[i]
             destination_keyframe_path = keyframe_paths[i + 1]
             transition_type, motion_prompt = decision["transition_type"], decision["motion_prompt"]
             story_history += f"\n- Ato {fragment_index}: {motion_prompt}"
             expected_height, expected_width = 768, 1152
             downscale_factor = 2 / 3
             downscaled_height = self._quantize_to_multiple(int(expected_height * downscale_factor), 8)
             downscaled_width = self._quantize_to_multiple(int(expected_width * downscale_factor), 8)
             target_resolution_tuple = (downscaled_height, downscaled_width)
             conditioning_items = []
             if eco_latent_for_next_loop is None:
                img_start = self._preprocess_image_for_latent_conversion(Image.open(start_keyframe_path).convert("RGB"), target_resolution_tuple)
             if transition_type == "cut":
                 eco_latent_for_next_loop, dejavu_latent_for_next_loop = None, None
+            # --- ACT I: LATENT POST-PRODUCTION ---
+            upscaled_latents = self.upscale_latents(latents_video)
+            refined_latents = self.refine_latents(upscaled_latents, motion_prompt=f"refining scene: {motion_prompt}")
+            processed_latent_fragments.append(refined_latents)
+        # --- END OF GENERATION LOOP ---
+        current_step += 1
+        progress(current_step / TOTAL_STEPS, desc="Concatenando fragmentos...")
+        # Every fragment is moved to self.device and joined whole along dim 2.
+        tensors_para_concatenar = [frag.to(self.device) for frag in processed_latent_fragments]
         final_concatenated_latents = torch.cat(tensors_para_concatenar, dim=2)
+        base_name = f"movie_{int(time.time())}"
+        current_step += 1
+        progress(current_step / TOTAL_STEPS, desc="Renderizando vídeo base...")
+        refined_silent_video_path = os.path.join(self.workspace_dir, f"{base_name}_refined_silent.mp4")
+        final_pixel_tensor = self.latents_to_pixels(final_concatenated_latents)
+        self.save_video_from_tensor(final_pixel_tensor, refined_silent_video_path, fps=FPS)
+        # Free VRAM before the next heavy step
+        del final_pixel_tensor, final_concatenated_latents, processed_latent_fragments, tensors_para_concatenar
+        gc.collect()
+        torch.cuda.empty_cache()
+        # --- ACT II: FIRST MASTERING (VIDEO + MUSIC) ---
+        current_step += 1
+        progress(current_step / TOTAL_STEPS, desc="Gerando trilha sonora...")
+        try:
+            # Assumes your audio_specialist has a method that returns the audio path
+            # NOTE(review): this calls generate_standalone_audio on the specialist, while
+            # self._generate_standalone_audio calls generate_audio - confirm which exists.
+            # NOTE(review): _get_video_duration returns 0.0 on failure and that value is
+            # passed on here without a check.
+            audio_path = audio_specialist_singleton.generate_standalone_audio(
+                prompt=global_prompt,
+                duration_seconds=self._get_video_duration(refined_silent_video_path),
+                output_dir=self.workspace_dir
+            )
+            refined_video_with_audio_path = os.path.join(self.workspace_dir, f"{base_name}_refined_with_audio.mp4")
+            self._combine_video_and_audio_ffmpeg(refined_silent_video_path, audio_path, refined_video_with_audio_path)
+            logger.info(f"Primeira masterização com áudio salva em: {refined_video_with_audio_path}")
+            # You may optionally yield this video here as a preview
+            # yield {"preview_path": refined_video_with_audio_path}
+        except Exception as e:
+            logger.error(f"Falha na geração ou combinação de áudio: {e}. Prosseguindo sem áudio.")
+            audio_path = None # Ensure the variable exists
+            refined_video_with_audio_path = refined_silent_video_path # Use the silent video as fallback
+        # --- ACT III: FINAL MASTERING (HD PASS) ---
+        current_step += 1
+        progress(current_step / TOTAL_STEPS, desc="Aprimoramento final (HD)...")
+        hq_silent_video_path = os.path.join(self.workspace_dir, f"{base_name}_hq_silent.mp4")
+        try:
+            # The HD Specialist processes the refined silent video
+            hd_specialist_singleton.process_video(
+                input_video_path=refined_silent_video_path,
+                output_video_path=hq_silent_video_path,
+                prompt=global_prompt
+            )
+        except Exception as e:
+            logger.error(f"Falha no processo de aprimoramento HD. Usando o vídeo refinado como fallback. Erro: {e}")
+            # If HD fails, use the refined (silent) video as the base for the final
+            # os.rename moves the file, so refined_silent_video_path no longer exists afterwards.
+            os.rename(refined_silent_video_path, hq_silent_video_path)
+        current_step += 1
+        progress(current_step / TOTAL_STEPS, desc="Finalizando montagem...")
+        final_video_path = os.path.join(self.workspace_dir, f"{base_name}_FINAL.mp4")
+        #if audio_path and os.path.exists(audio_path):
+        #    # If audio was generated, combine the HIGH QUALITY video with it
+        #    self._combine_video_and_audio_ffmpeg(hq_silent_video_path, audio_path, final_video_path)
+        #else:
+        #    # If there is no audio, just rename the high-quality video
+        #    os.rename(hq_silent_video_path, final_video_path)
+        # NOTE(review): with the block above commented out, nothing visible here
+        # writes final_video_path: the log line reports hq_silent_video_path while
+        # the yielded "final_path" points at a file this code never creates.
+        logger.info(f"Processo concluído! Vídeo final salvo em: {hq_silent_video_path}")
+        yield {"final_path": final_video_path}
     def refine_latents(self, latents: torch.Tensor,
                        fps: int = 24,
                        denoise_strength: float = 0.35,
                        refine_steps: int = 12,
                        motion_prompt: str = "refining video, improving details, cinematic quality") -> torch.Tensor:
+        """Apply a refinement (denoise) pass to a latent tensor.
+
+        The tensor must have five dimensions; the last three are read as
+        frame count, latent height and latent width. Returns the first element
+        of the pair returned by self.ltx_manager.refine_latents.
+        """
+        logger.info(f"Refinando tensor latente com shape {latents.shape}.")
         _, _, num_frames, latent_h, latent_w = latents.shape
+        # Falls back to 8 when the VAE config has no scaling_factor attribute.
+        # NOTE(review): assumes config.scaling_factor is the latent-to-pixel spatial
+        # ratio; in some VAE configs that name is a latent normalisation scalar - confirm.
         vae_scale_factor = self.vae.config.scaling_factor if hasattr(self.vae.config, 'scaling_factor') else 8
+        pixel_height, pixel_width = latent_h * vae_scale_factor, latent_w * vae_scale_factor
+        # current_fragment_index is filled with the current Unix timestamp.
         refined_latents_tensor, _ = self.ltx_manager.refine_latents(
+            latents, height=pixel_height, width=pixel_width, video_total_frames=num_frames,
+            video_fps=fps, motion_prompt=motion_prompt, current_fragment_index=int(time.time()),
+            denoise_strength=denoise_strength, refine_steps=refine_steps)
         return refined_latents_tensor
     def upscale_latents(self, latents: torch.Tensor) -> torch.Tensor:
+        """Interface to the UpscalerSpecialist.
+
+        Logs the input shape and returns the result of
+        upscaler_specialist_singleton.upscale(latents) unchanged.
+        """
+        logger.info(f"Realizando upscale em tensor latente com shape {latents.shape}.")
+        return upscaler_specialist_singleton.upscale(latents)
     def _generate_latent_tensor_internal(self, conditioning_items, ltx_params, target_resolution, total_frames_to_generate):
         kwargs = {
             **ltx_params, 'width': target_resolution[1], 'height': target_resolution[0],