Test3

Paused

App Files Files Community

EuuIia committed on Oct 6

Commit

953982d

verified ·

1 Parent(s): 82b2143

Update api/ltx_server.py

Browse files

Files changed (1) hide show

api/ltx_server.py +137 -181

api/ltx_server.py CHANGED Viewed

@@ -42,6 +42,8 @@ import shutil
 import contextlib
 import time
 import traceback
 # Singletons (versões simples)
 from managers.vae_manager import vae_manager_singleton
@@ -157,6 +159,9 @@ add_deps_to_path()
 from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline
 from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
 # --- 4. FUNÇÕES HELPER DE LOG ---
 def log_tensor_info(tensor, name="Tensor"):
@@ -174,6 +179,28 @@ def log_tensor_info(tensor, name="Tensor"):
             pass
     print("------------------------------------------\n")
 # --- 5. CLASSE PRINCIPAL DO SERVIÇO ---
 class VideoService:
     def __init__(self):
@@ -441,7 +468,7 @@ class VideoService:
             chunks.append(latents_brutos)
         print("================PODA CAUSAL=================")
         return chunks
     def _get_total_frames(self, video_path: str) -> int:
         cmd = [
             "ffprobe",
@@ -455,8 +482,6 @@ class VideoService:
         result = subprocess.run(cmd, capture_output=True, text=True, check=True)
         return int(result.stdout.strip())
     def _gerar_lista_com_transicoes(self, pasta: str, video_paths: list[str], crossfade_frames: int = 8) -> list[str]:
         """
         Gera uma nova lista de vídeos aplicando transições suaves (blend frame a frame)
@@ -538,7 +563,7 @@ class VideoService:
         print("===========CONCATECAO CAUSAL=============")
         print(f"[DEBUG] {nova_lista}")
         return nova_lista
     def _concat_mp4s_no_reencode(self, mp4_list: List[str], out_path: str):
         """
         Concatena múltiplos MP4s sem reencode usando o demuxer do ffmpeg.
@@ -564,8 +589,11 @@ class VideoService:
                 os.remove(list_path)
             except Exception:
                 pass
     def generate(
         self,
         prompt,
@@ -587,7 +615,6 @@ class VideoService:
         guidance_scale=3.0,
         improve_texture=True,
         progress_callback=None,
-        # Sempre latent → VAE → MP4 (simples)
         external_decode=True,
     ):
         t_all = time.perf_counter()
@@ -596,211 +623,140 @@ class VideoService:
             torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
         self._log_gpu_memory("Início da Geração")
         if mode == "image-to-video" and not start_image_filepath:
             raise ValueError("A imagem de início é obrigatória para o modo image-to-video")
-        if mode == "video-to-video" and not input_video_filepath:
-            raise ValueError("O vídeo de entrada é obrigatório para o modo video-to-video")
         used_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)
         seed_everething(used_seed); print(f"[DEBUG] Seed usado: {used_seed}")
         FPS = 24.0; MAX_NUM_FRAMES = 2570
         target_frames_rounded = round(duration * FPS)
         n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
         actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(n_val * 8 + 1)))
-        print(f"[DEBUG] Frames alvo: {actual_num_frames} (dur={duration}s @ {FPS}fps)")
         height_padded = ((height - 1) // 32 + 1) * 32
         width_padded = ((width - 1) // 32 + 1) * 32
         padding_values = calculate_padding(height, width, height_padded, width_padded)
-        print(f"[DEBUG] Dimensões: ({height},{width}) -> pad ({height_padded},{width_padded}); padding={padding_values}")
         generator = torch.Generator(device=self.device).manual_seed(used_seed)
         conditioning_items = []
         if mode == "image-to-video":
-            start_tensor = self._prepare_conditioning_tensor(start_image_filepath, height, width, padding_values)
-            conditioning_items.append(ConditioningItem(start_tensor, 0, 1.0))
-            if middle_image_filepath and middle_frame_number is not None:
-                middle_tensor = self._prepare_conditioning_tensor(middle_image_filepath, height, width, padding_values)
-                safe_middle_frame = max(0, min(int(middle_frame_number), actual_num_frames - 1))
-                conditioning_items.append(ConditioningItem(middle_tensor, safe_middle_frame, float(middle_image_weight)))
-            if end_image_filepath:
-                end_tensor = self._prepare_conditioning_tensor(end_image_filepath, height, width, padding_values)
-                last_frame_index = actual_num_frames - 1
-                conditioning_items.append(ConditioningItem(end_tensor, last_frame_index, float(end_image_weight)))
-            print(f"[DEBUG] Conditioning items: {len(conditioning_items)}")
-        # Sempre pedimos latentes (simples)
         call_kwargs = {
-            "prompt": prompt,
-            "negative_prompt": negative_prompt,
-            "height": height_padded,
-            "width": width_padded,
-            "num_frames": actual_num_frames,
-            "frame_rate": int(FPS),
-            "generator": generator,
-            "output_type": "latent",
             "conditioning_items": conditioning_items if conditioning_items else None,
-            "media_items": None,
-            "decode_timestep": self.config["decode_timestep"],
-            "decode_noise_scale": self.config["decode_noise_scale"],
-            "stochastic_sampling": self.config["stochastic_sampling"],
-            "image_cond_noise_scale": 0.01,
-            "is_video": True,
-            "vae_per_channel_normalize": True,
-            "mixed_precision": (self.config["precision"] == "mixed_precision"),
-            "offload_to_cpu": False,
-            "enhance_prompt": False,
-            "skip_layer_strategy": SkipLayerStrategy.AttentionValues,
         }
-        print(f"[DEBUG] output_type={call_kwargs['output_type']} skip_layer_strategy={call_kwargs['skip_layer_strategy']}")
-        if mode == "video-to-video":
-            media = load_media_file(
-                media_path=input_video_filepath,
-                height=height,
-                width=width,
-                max_frames=int(frames_to_use),
-                padding=padding_values,
-            ).to(self.device)
-            call_kwargs["media_items"] = media
-            print(f"[DEBUG] media_items shape={tuple(media.shape)}")
         latents = None
-        multi_scale_pipeline = None
         try:
-            if improve_texture:
-                if not self.latent_upsampler:
-                    raise ValueError("Upscaler espacial não carregado.")
-                print("[DEBUG] Multi-escala: construindo pipeline...")
-                multi_scale_pipeline = LTXMultiScalePipeline(self.pipeline, self.latent_upsampler)
-                first_pass_args = self.config.get("first_pass", {}).copy()
-                first_pass_args["guidance_scale"] = float(guidance_scale)
-                second_pass_args = self.config.get("second_pass", {}).copy()
-                second_pass_args["guidance_scale"] = float(guidance_scale)
-                multi_scale_call_kwargs = call_kwargs.copy()
-                multi_scale_call_kwargs.update(
-                    {
-                        "downscale_factor": self.config["downscale_factor"],
-                        "first_pass": first_pass_args,
-                        "second_pass": second_pass_args,
-                    }
-                )
-                print("[DEBUG] Chamando multi_scale_pipeline...")
-                t_ms = time.perf_counter()
-                ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext()
-                with ctx:
-                    result = multi_scale_pipeline(**multi_scale_call_kwargs)
-                print(f"[DEBUG] multi_scale_pipeline tempo={time.perf_counter()-t_ms:.3f}s")
-                if hasattr(result, "latents"):
-                    latents = result.latents
-                elif hasattr(result, "images") and isinstance(result.images, torch.Tensor):
-                    latents = result.images
-                else:
-                    latents = result
-                print(f"[DEBUG] Latentes (multi-escala): shape={tuple(latents.shape)}")
-            else:
-                single_pass_kwargs = call_kwargs.copy()
-                first_pass_config = self.config.get("first_pass", {})
-                single_pass_kwargs.update(
-                    {
                         "guidance_scale": float(guidance_scale),
-                        "stg_scale": first_pass_config.get("stg_scale"),
-                        "rescaling_scale": first_pass_config.get("rescaling_scale"),
-                        "skip_block_list": first_pass_config.get("skip_block_list"),
-                    }
-                )
-                schedule = first_pass_config.get("timesteps") or first_pass_config.get("guidance_timesteps")
-                if mode == "video-to-video":
-                    schedule = [0.7]; print("[INFO] Modo video-to-video (etapa única): timesteps=[0.7]")
-                if isinstance(schedule, (list, tuple)) and len(schedule) > 0:
-                    single_pass_kwargs["timesteps"] = schedule
-                    single_pass_kwargs["guidance_timesteps"] = schedule
-                print(f"[DEBUG] Single-pass: timesteps_len={len(schedule) if schedule else 0}")
-                print("\n[INFO] Executando pipeline de etapa única...")
-                t_sp = time.perf_counter()
-                ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext()
-                with ctx:
-                    result = self.pipeline(**single_pass_kwargs)
-                print(f"[DEBUG] single-pass tempo={time.perf_counter()-t_sp:.3f}s")
-                if hasattr(result, "latents"):
-                    latents = result.latents
-                elif hasattr(result, "images") and isinstance(result.images, torch.Tensor):
-                    latents = result.images
-                else:
-                    latents = result
-                print(f"[DEBUG] Latentes (single-pass): shape={tuple(latents.shape)}")
-            # Staging e escrita MP4 (simples: VAE → pixels → MP4)
             latents_cpu = latents.detach().to("cpu", non_blocking=True)
-            torch.cuda.empty_cache()
-            try:
-                torch.cuda.ipc_collect()
-            except Exception:
-                pass
-            latents_parts = self._dividir_latentes_por_tamanho(latents_cpu,4,1)
             temp_dir = tempfile.mkdtemp(prefix="ltxv_"); self._register_tmp_dir(temp_dir)
             results_dir = "/app/output"; os.makedirs(results_dir, exist_ok=True)
-            partes_mp4 = []
-            par = 0
-            for latents in latents_parts:
-                print(f"[DEBUG] Partição {par}: {tuple(latents.shape)}")
-                par = par + 1
-                output_video_path = os.path.join(temp_dir, f"output_{used_seed}_{par}.mp4")
-                final_output_path = None
-                print("[DEBUG] Decodificando bloco de latentes com VAE → tensor de pixels...")
-                # Usar manager com timestep por item; previne target_shape e rota NoneType.decode
-                pixel_tensor = vae_manager_singleton.decode(
-                    latents.to(self.device, non_blocking=True),
-                    decode_timestep=float(self.config.get("decode_timestep", 0.05))
-                )
-                log_tensor_info(pixel_tensor, "Pixel tensor (VAE saída)")
-                print("[DEBUG] Codificando MP4 a partir do tensor de pixels (bloco inteiro)...")
-                video_encode_tool_singleton.save_video_from_tensor(
-                    pixel_tensor,
-                    output_video_path,
-                    fps=call_kwargs["frame_rate"],
-                    progress_callback=progress_callback
-                )
-                candidate = os.path.join(results_dir, f"output_par_{par}.mp4")
-                try:
-                    shutil.move(output_video_path, candidate)
-                    final_output_path = candidate
-                    print(f"[DEBUG] MP4 parte {par} movido para {final_output_path}")
-                    partes_mp4.append(final_output_path)
-                except Exception as e:
-                    final_output_path = output_video_path
-                    print(f"[DEBUG] Falha no move; usando tmp como final: {e}")
-            total_partes = len(partes_mp4)
-            if (total_partes>1):
-                final_vid = os.path.join(results_dir, f"concat_fim_{used_seed}.mp4")
-                partes_mp4_fade = self._gerar_lista_com_transicoes(pasta=results_dir, video_paths=partes_mp4, crossfade_frames=8)
-                self._concat_mp4s_no_reencode(partes_mp4_fade, final_vid)
             else:
-                final_vid = partes_mp4[0]
             self._log_gpu_memory("Fim da Geração")
-            return final_vid, used_seed
         except Exception as e:
             print("[DEBUG] EXCEÇÃO NA GERAÇÃO:")
             print("".join(traceback.format_exception(type(e), e, e.__traceback__)))
@@ -832,4 +788,4 @@ class VideoService:
                 print(f"[DEBUG] finalize() no finally falhou: {e}")
 print("Criando instância do VideoService. O carregamento do modelo começará agora...")
-video_generation_service = VideoService()

 import contextlib
 import time
 import traceback
+from einops import rearrange
+import torch.nn.functional as F
 # Singletons (versões simples)
 from managers.vae_manager import vae_manager_singleton
 from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline
 from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
+from ltx_video.models.autoencoders.vae_encode import un_normalize_latents, normalize_latents
+from ltx_video.pipelines.pipeline_ltx_video import adain_filter_latent
 # --- 4. FUNÇÕES HELPER DE LOG ---
 def log_tensor_info(tensor, name="Tensor"):
             pass
     print("------------------------------------------\n")
+    # NOTE(review): in this diff the method is inserted after log_tensor_info and
+    # before the `class VideoService:` line, yet generate() calls it as
+    # self._upsample_latents_internal(...) — confirm it ends up inside the class.
+    @torch.no_grad()
+    def _upsample_latents_internal(self, latents: torch.Tensor) -> torch.Tensor:
+        """
+        Upscale a latent tensor with the loaded latent upsampler.
+
+        Per the original author, this logic was taken directly from
+        LTXMultiScalePipeline. The latents are un-normalized against the
+        pipeline VAE, passed through ``self.latent_upsampler``, and then
+        normalized again; gradient tracking is disabled by the decorator.
+
+        Args:
+            latents: Latent tensor to upscale (layout not visible here —
+                TODO confirm the expected shape against the caller).
+
+        Returns:
+            The upsampled latents after re-normalization.
+
+        Raises:
+            ValueError: If ``self.latent_upsampler`` is falsy (not loaded).
+        """
+        if not self.latent_upsampler:
+            raise ValueError("Latent Upsampler não está carregado.")
+        # Make sure both models are on the service's device before use
+        self.latent_upsampler.to(self.device)
+        self.pipeline.vae.to(self.device)
+        print(f"[DEBUG-UPSAMPLE] Shape de entrada: {tuple(latents.shape)}")
+        # Un-normalize before upsampling, then normalize the result again
+        latents = un_normalize_latents(latents, self.pipeline.vae, vae_per_channel_normalize=True)
+        upsampled_latents = self.latent_upsampler(latents)
+        upsampled_latents = normalize_latents(upsampled_latents, self.pipeline.vae, vae_per_channel_normalize=True)
+        print(f"[DEBUG-UPSAMPLE] Shape de saída: {tuple(upsampled_latents.shape)}")
+        return upsampled_latents
 # --- 5. CLASSE PRINCIPAL DO SERVIÇO ---
 class VideoService:
     def __init__(self):
             chunks.append(latents_brutos)
         print("================PODA CAUSAL=================")
         return chunks
     def _get_total_frames(self, video_path: str) -> int:
         cmd = [
             "ffprobe",
         result = subprocess.run(cmd, capture_output=True, text=True, check=True)
         return int(result.stdout.strip())
     def _gerar_lista_com_transicoes(self, pasta: str, video_paths: list[str], crossfade_frames: int = 8) -> list[str]:
         """
         Gera uma nova lista de vídeos aplicando transições suaves (blend frame a frame)
         print("===========CONCATECAO CAUSAL=============")
         print(f"[DEBUG] {nova_lista}")
         return nova_lista
     def _concat_mp4s_no_reencode(self, mp4_list: List[str], out_path: str):
         """
         Concatena múltiplos MP4s sem reencode usando o demuxer do ffmpeg.
                 os.remove(list_path)
             except Exception:
                 pass
+    # ==============================================================================
+    # --- FUNÇÃO GENERATE COMPLETA E ATUALIZADA ---
+    # ==============================================================================
     def generate(
         self,
         prompt,
         guidance_scale=3.0,
         improve_texture=True,
         progress_callback=None,
         external_decode=True,
     ):
         t_all = time.perf_counter()
             torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
         self._log_gpu_memory("Início da Geração")
+        # --- Setup Inicial (como antes) ---
         if mode == "image-to-video" and not start_image_filepath:
             raise ValueError("A imagem de início é obrigatória para o modo image-to-video")
         used_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)
         seed_everething(used_seed); print(f"[DEBUG] Seed usado: {used_seed}")
         FPS = 24.0; MAX_NUM_FRAMES = 2570
         target_frames_rounded = round(duration * FPS)
         n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
         actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(n_val * 8 + 1)))
         height_padded = ((height - 1) // 32 + 1) * 32
         width_padded = ((width - 1) // 32 + 1) * 32
         padding_values = calculate_padding(height, width, height_padded, width_padded)
         generator = torch.Generator(device=self.device).manual_seed(used_seed)
         conditioning_items = []
         if mode == "image-to-video":
+            # ... (lógica de preparação de conditioning_items como antes)
         call_kwargs = {
+            "prompt": prompt, "negative_prompt": negative_prompt,
+            "height": height_padded, "width": width_padded, "num_frames": actual_num_frames,
+            "frame_rate": int(FPS), "generator": generator,
             "conditioning_items": conditioning_items if conditioning_items else None,
+            "media_items": None, # (Lógica para video-to-video omitida por clareza)
+            # ... (outros kwargs base como antes)
         }
         latents = None
         try:
+            ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext()
+            with ctx:
+                if improve_texture:
+                    if not self.latent_upsampler:
+                        raise ValueError("Upscaler espacial não carregado, mas 'improve_texture' está ativo.")
+                    # --- ETAPA 1: GERAÇÃO BASE (FIRST PASS) ---
+                    print("\n--- INICIANDO ETAPA 1: GERAÇÃO BASE (FIRST PASS) ---")
+                    t_pass1 = time.perf_counter()
+                    first_pass_config = self.config.get("first_pass", {}).copy()
+                    downscale_factor = self.config.get("downscale_factor", 0.666)
+                    downscaled_width = int(width_padded * downscale_factor)
+                    downscaled_height = int(height_padded * downscale_factor)
+                    first_pass_kwargs = call_kwargs.copy()
+                    first_pass_kwargs.update({
+                        "output_type": "latent",
+                        "width": downscaled_width,
+                        "height": downscaled_height,
                         "guidance_scale": float(guidance_scale),
+                        **first_pass_config
+                    })
+                    print(f"[DEBUG] First Pass: Gerando em {downscaled_width}x{downscaled_height}...")
+                    base_latents = self.pipeline(**first_pass_kwargs).images
+                    log_tensor_info(base_latents, "Latentes Base (First Pass)")
+                    print(f"[DEBUG] First Pass concluída em {time.perf_counter() - t_pass1:.2f}s")
+                    # --- ETAPA 2: UPSCALE DOS LATENTES ---
+                    print("\n--- INICIANDO ETAPA 2: UPSCALE DOS LATENTES ---")
+                    t_upscale = time.perf_counter()
+                    upsampled_latents = self._upsample_latents_internal(base_latents)
+                    upsampled_latents = adain_filter_latent(latents=upsampled_latents, reference_latents=base_latents)
+                    log_tensor_info(upsampled_latents, "Latentes Pós-Upscale")
+                    print(f"[DEBUG] Upscale de Latentes concluído em {time.perf_counter() - t_upscale:.2f}s")
+                    del base_latents; gc.collect(); torch.cuda.empty_cache()
+                    # --- ETAPA 3: REFINAMENTO DE TEXTURA (SECOND PASS) ---
+                    print("\n--- INICIANDO ETAPA 3: REFINAMENTO DE TEXTURA (SECOND PASS) ---")
+                    t_pass2 = time.perf_counter()
+                    second_pass_config = self.config.get("second_pass", {}).copy()
+                    second_pass_kwargs = call_kwargs.copy()
+                    second_pass_kwargs.update({
+                        "output_type": "latent",
+                        "width": width_padded,
+                        "height": height_padded,
+                        "latents": upsampled_latents,
+                        "guidance_scale": float(guidance_scale),
+                        **second_pass_config
+                    })
+                    print(f"[DEBUG] Second Pass: Refinando em {width_padded}x{height_padded}...")
+                    final_latents = self.pipeline(**second_pass_kwargs).images
+                    log_tensor_info(final_latents, "Latentes Finais (Pós-Second Pass)")
+                    print(f"[DEBUG] Second Pass concluída em {time.perf_counter() - t_pass2:.2f}s")
+                    latents = final_latents
+                else: # Geração de etapa única
+                    print("\n--- INICIANDO GERAÇÃO DE ETAPA ÚNICA ---")
+                    t_single = time.perf_counter()
+                    single_pass_kwargs = call_kwargs.copy()
+                    single_pass_kwargs.update(self.config.get("first_pass", {}))
+                    single_pass_kwargs["guidance_scale"] = float(guidance_scale)
+                    single_pass_kwargs["output_type"] = "latent"
+                    latents = self.pipeline(**single_pass_kwargs).images
+                    log_tensor_info(latents, "Latentes Finais (Etapa Única)")
+                    print(f"[DEBUG] Etapa única concluída em {time.perf_counter() - t_single:.2f}s")
+            # --- ETAPA FINAL: DECODIFICAÇÃO E CODIFICAÇÃO MP4 ---
+            print("\n--- INICIANDO ETAPA FINAL: DECODIFICAÇÃO E MONTAGEM ---")
             latents_cpu = latents.detach().to("cpu", non_blocking=True)
+            del latents; gc.collect(); torch.cuda.empty_cache()
             temp_dir = tempfile.mkdtemp(prefix="ltxv_"); self._register_tmp_dir(temp_dir)
             results_dir = "/app/output"; os.makedirs(results_dir, exist_ok=True)
+            # (A lógica de divisão de latentes e concatenação com fade que você já tem vai aqui)
+            latents_parts = self._dividir_latentes_por_tamanho(latents_cpu, 4, 1) # Exemplo de divisão
+            partes_mp4 = []
+            for i, part_latents in enumerate(latents_parts):
+                output_part_path = os.path.join(temp_dir, f"part_{i}.mp4")
+                pixel_tensor = vae_manager_singleton.decode(part_latents.to(self.device), decode_timestep=0.05)
+                video_encode_tool_singleton.save_video_from_tensor(pixel_tensor, output_part_path, fps=FPS)
+                partes_mp4.append(output_part_path)
+            final_concat_path = os.path.join(results_dir, f"concat_fim_{used_seed}.mp4")
+            if len(partes_mp4) > 1:
+                 # Sua lógica de _gerar_lista_com_transicoes e _concat_mp4s_no_reencode
+                 # ...
+                 pass # Substitua pelo seu código
             else:
+                 shutil.copy(partes_mp4[0], final_concat_path)
             self._log_gpu_memory("Fim da Geração")
+            return final_concat_path, used_seed
         except Exception as e:
             print("[DEBUG] EXCEÇÃO NA GERAÇÃO:")
             print("".join(traceback.format_exception(type(e), e, e.__traceback__)))
                 print(f"[DEBUG] finalize() no finally falhou: {e}")
 print("Criando instância do VideoService. O carregamento do modelo começará agora...")
+video_generation_service = VideoService(