Test3

Paused

App Files Files Community

EuuIia committed on Oct 3

Commit

5e7dd18

verified ·

1 Parent(s): 690fc1d

Upload ltx_server.py

Browse files

Files changed (1) hide show

api/ltx_server.py +227 -151

api/ltx_server.py CHANGED Viewed

@@ -1,3 +1,7 @@
 import torch
 import numpy as np
 import random
@@ -14,12 +18,15 @@ import subprocess
 import gc
 import shutil
 import contextlib
 # --- 2. GERENCIAMENTO DE DEPENDÊNCIAS E SETUP ---
 def _query_gpu_processes_via_nvml(device_index: int) -> List[Dict]:
     try:
         import psutil
         import pynvml as nvml
         nvml.nvmlInit()
         handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
         try:
@@ -44,25 +51,29 @@ def _query_gpu_processes_via_nvml(device_index: int) -> List[Dict]:
             except Exception:
                 pass
             results.append({"pid": pid, "name": name, "user": user, "used_mb": used_mb})
         nvml.nvmlShutdown()
         return results
-    except Exception:
         return []
 def _query_gpu_processes_via_nvidiasmi(device_index: int) -> List[Dict]:
     cmd = f"nvidia-smi -i {device_index} --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits"
     try:
         out = subprocess.check_output(shlex.split(cmd), stderr=subprocess.STDOUT, text=True, timeout=2.0)
-    except Exception:
         return []
     results = []
     for line in out.strip().splitlines():
         parts = [p.strip() for p in line.split(",")]
         if len(parts) >= 3:
             try:
-                pid = int(parts[0])
-                name = parts[1]
-                used_mb = int(parts[2])
                 user = "unknown"
                 try:
                     import psutil
@@ -87,30 +98,29 @@ def _gpu_process_table(processes: List[Dict], current_pid: int) -> str:
     return "\n".join(lines) + "\n"
 def run_setup():
-    """Executa o script setup.py para clonar as dependências necessárias."""
     setup_script_path = "setup.py"
     if not os.path.exists(setup_script_path):
-        print("AVISO: script 'setup.py' não encontrado. Pulando a clonagem de dependências.")
         return
     try:
-        print("--- Executando setup.py para garantir que as dependências estão presentes ---")
         subprocess.run([sys.executable, setup_script_path], check=True)
-        print("--- Setup concluído com sucesso ---")
     except subprocess.CalledProcessError as e:
-        print(f"ERRO CRÍTICO DURANTE O SETUP: 'setup.py' falhou com código {e.returncode}.")
         sys.exit(1)
 DEPS_DIR = Path("/data")
 LTX_VIDEO_REPO_DIR = DEPS_DIR / "LTX-Video"
 if not LTX_VIDEO_REPO_DIR.exists():
     run_setup()
 def add_deps_to_path():
-    """Adiciona o repositório clonado ao sys.path para que suas bibliotecas possam ser importadas."""
-    if not LTX_VIDEO_REPO_DIR.exists():
-        raise FileNotFoundError(f"Repositório LTX-Video não encontrado em '{LTX_VIDEO_REPO_DIR}'. Execute o setup.")
     if str(LTX_VIDEO_REPO_DIR.resolve()) not in sys.path:
-        sys.path.insert(0, str(LTX_VIDEO_REPO_DIR.resolve()))
 add_deps_to_path()
@@ -129,44 +139,51 @@ from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
 # --- 4. FUNÇÕES HELPER DE LOG ---
 def log_tensor_info(tensor, name="Tensor"):
     if not isinstance(tensor, torch.Tensor):
-        print(f"\n[INFO] O item '{name}' não é um tensor para logar.")
         return
-    print(f"\n--- Informações do Tensor: {name} ---")
-    print(f"  - Shape: {tensor.shape}")
     print(f"  - Dtype: {tensor.dtype}")
     print(f"  - Device: {tensor.device}")
     if tensor.numel() > 0:
-        print(f"  - Min valor: {tensor.min().item():.4f}")
-        print(f"  - Max valor: {tensor.max().item():.4f}")
-        print(f"  - Média: {tensor.mean().item():.4f}")
-    else:
-        print("  - O tensor está vazio, sem estatísticas.")
     print("------------------------------------------\n")
 # --- 5. CLASSE PRINCIPAL DO SERVIÇO ---
 class VideoService:
     def __init__(self):
-        print("Inicializando VideoService...")
         self.config = self._load_config()
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.last_memory_reserved_mb = 0.0
         self._tmp_dirs = set()
         self._tmp_files = set()
         self._last_outputs = []
         self.pipeline, self.latent_upsampler = self._load_models()
-        print(f"Movendo modelos para o dispositivo de inferência: {self.device}")
         self.pipeline.to(self.device)
         if self.latent_upsampler:
             self.latent_upsampler.to(self.device)
-        # Política de precisão (FP8 opcional + autocast coerente)
         self._apply_precision_policy()
         if self.device == "cuda":
             torch.cuda.empty_cache()
             self._log_gpu_memory("Após carregar modelos")
-        print("VideoService pronto para uso.")
     def _log_gpu_memory(self, stage_name: str):
         if self.device != "cuda":
@@ -181,51 +198,54 @@ class VideoService:
         processes = _query_gpu_processes_via_nvml(device_index)
         if not processes:
             processes = _query_gpu_processes_via_nvidiasmi(device_index)
-        print(f"\n--- [LOG DE MEMÓRIA GPU] - {stage_name} (cuda:{device_index}) ---")
-        print(f"  - Uso Atual (Reservado): {current_reserved_mb:.2f} MB / {total_memory_mb:.2f} MB")
-        print(f"  - Variação desde o último log: {delta_mb:+.2f} MB")
         if peak_reserved_mb > getattr(self, "last_memory_reserved_mb", 0.0):
-            print(f"  - Pico de Uso (nesta operação): {peak_reserved_mb:.2f} MB")
         print(_gpu_process_table(processes, os.getpid()), end="")
         print("--------------------------------------------------\n")
         self.last_memory_reserved_mb = current_reserved_mb
     def _register_tmp_dir(self, d: str):
-        try:
-            if d and os.path.isdir(d):
-                self._tmp_dirs.add(d)
-        except Exception:
-            pass
     def _register_tmp_file(self, f: str):
-        try:
-            if f and os.path.isfile(f):
-                self._tmp_files.add(f)
-        except Exception:
-            pass
     def finalize(self, keep_paths=None, extra_paths=None, clear_gpu=True):
         keep = set(keep_paths or [])
         extras = set(extra_paths or [])
         for f in list(self._tmp_files | extras):
             try:
                 if f not in keep and os.path.isfile(f):
                     os.remove(f)
-            except Exception:
-                pass
             finally:
                 self._tmp_files.discard(f)
         for d in list(self._tmp_dirs):
             try:
                 if d not in keep and os.path.isdir(d):
                     shutil.rmtree(d, ignore_errors=True)
-            except Exception:
-                pass
             finally:
                 self._tmp_dirs.discard(d)
         gc.collect()
         try:
             if clear_gpu and torch.cuda.is_available():
@@ -234,13 +254,13 @@ class VideoService:
                     torch.cuda.ipc_collect()
                 except Exception:
                     pass
-        except Exception:
-            pass
         try:
             self._log_gpu_memory("Após finalize")
-        except Exception:
-            pass
     def _load_config(self):
         base = LTX_VIDEO_REPO_DIR / "configs"
@@ -252,15 +272,18 @@ class VideoService:
         ]
         for cfg in candidates:
             if cfg.exists():
                 with open(cfg, "r") as file:
                     return yaml.safe_load(file)
-        config_file_path = base / "ltxv-13b-0.9.8-distilled.yaml"
-        with open(config_file_path, "r") as file:
             return yaml.safe_load(file)
     def _load_models(self):
         LTX_REPO = "Lightricks/LTX-Video"
         distilled_model_path = hf_hub_download(
             repo_id=LTX_REPO,
             filename=self.config["checkpoint_path"],
@@ -269,7 +292,9 @@ class VideoService:
             token=os.getenv("HF_TOKEN"),
         )
         self.config["checkpoint_path"] = distilled_model_path
         spatial_upscaler_path = hf_hub_download(
             repo_id=LTX_REPO,
             filename=self.config["spatial_upscaler_model_path"],
@@ -278,7 +303,9 @@ class VideoService:
             token=os.getenv("HF_TOKEN"),
         )
         self.config["spatial_upscaler_model_path"] = spatial_upscaler_path
         pipeline = create_ltx_video_pipeline(
             ckpt_path=self.config["checkpoint_path"],
             precision=self.config["precision"],
@@ -289,49 +316,60 @@ class VideoService:
             prompt_enhancer_image_caption_model_name_or_path=self.config["prompt_enhancer_image_caption_model_name_or_path"],
             prompt_enhancer_llm_model_name_or_path=self.config["prompt_enhancer_llm_model_name_or_path"],
         )
         latent_upsampler = None
         if self.config.get("spatial_upscaler_model_path"):
             latent_upsampler = create_latent_upsampler(self.config["spatial_upscaler_model_path"], device="cpu")
         return pipeline, latent_upsampler
     def _promote_fp8_weights_to_bf16(self, module):
         if not isinstance(module, torch.nn.Module):
             return
         f8 = getattr(torch, "float8_e4m3fn", None)
         if f8 is None:
             return
         for _, p in module.named_parameters(recurse=True):
             try:
                 if p.dtype == f8:
                     with torch.no_grad():
                         p.data = p.data.to(torch.bfloat16)
             except Exception:
                 pass
         for _, b in module.named_buffers(recurse=True):
             try:
                 if hasattr(b, "dtype") and b.dtype == f8:
                     b.data = b.data.to(torch.bfloat16)
             except Exception:
                 pass
     def _apply_precision_policy(self):
         prec = str(self.config.get("precision", "")).lower()
         self.runtime_autocast_dtype = torch.float32
         if prec == "float8_e4m3fn":
             self.runtime_autocast_dtype = torch.bfloat16
             force_promote = os.getenv("LTXV_FORCE_BF16_ON_FP8", "0") == "1"
             if force_promote and hasattr(torch, "float8_e4m3fn"):
                 try:
                     self._promote_fp8_weights_to_bf16(self.pipeline)
-                except Exception:
-                    pass
                 try:
                     if self.latent_upsampler:
                         self._promote_fp8_weights_to_bf16(self.latent_upsampler)
-                except Exception:
-                    pass
         elif prec == "bfloat16":
             self.runtime_autocast_dtype = torch.bfloat16
         elif prec == "mixed_precision":
@@ -340,35 +378,41 @@ class VideoService:
             self.runtime_autocast_dtype = torch.float32
     def _prepare_conditioning_tensor(self, filepath, height, width, padding_values):
         tensor = load_image_to_tensor_with_resize_and_crop(filepath, height, width)
         tensor = torch.nn.functional.pad(tensor, padding_values)
-        if self.device == "cuda":
-            return tensor.to(self.device, dtype=self.runtime_autocast_dtype)
-        return tensor.to(self.device)
-    # Nova: decodificação de latentes fora da pipeline com VAE e escrita incremental
     def _decode_latents_to_video(self, latents: torch.Tensor, output_video_path: str, frame_rate: int,
                                  padding_values, progress_callback=None):
         pad_left, pad_right, pad_top, pad_bottom = padding_values
         with imageio.get_writer(output_video_path, fps=frame_rate, codec="libx264", quality=8) as writer:
-            T = latents.shape[2]
             for i in range(T):
                 latent_chw = latents[0, :, i].to(self.device)
                 with torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext():
                     pixel_bchw = None
                     if hasattr(self.pipeline, "decode_latents"):
                         pixel_bchw = self.pipeline.decode_latents(latent_chw.unsqueeze(0))
                     elif hasattr(self.pipeline, "vae") and hasattr(self.pipeline.vae, "decode"):
                         pixel_bchw = self.pipeline.vae.decode(latent_chw.unsqueeze(0))
                     else:
-                        raise RuntimeError("Pipeline não expõe decode_latents nem vae.decode para decodificar latentes.")
-                pixel_chw = pixel_bchw[0]
                 if pixel_chw.min() < 0:
                     pixel_chw = (pixel_chw.clamp(-1, 1) + 1.0) / 2.0
                 else:
                     pixel_chw = pixel_chw.clamp(0, 1)
-                H = pixel_chw.shape[1]
-                W = pixel_chw.shape[2]
                 h_end = H - pad_bottom if pad_bottom > 0 else H
                 w_end = W - pad_right if pad_right > 0 else W
                 pixel_chw = pixel_chw[:, pad_top:h_end, pad_left:w_end]
@@ -380,6 +424,9 @@ class VideoService:
                 writer.append_data(frame_hwc_u8)
                 if progress_callback:
                     progress_callback(i + 1, T)
     def generate(
         self,
@@ -402,8 +449,10 @@ class VideoService:
         guidance_scale=3.0,
         improve_texture=True,
         progress_callback=None,
-        external_decode=True,  # NOVO: decodificar fora da pipeline
     ):
         if self.device == "cuda":
             torch.cuda.empty_cache()
             torch.cuda.reset_peak_memory_stats()
@@ -416,16 +465,19 @@ class VideoService:
         used_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)
         seed_everething(used_seed)
         FPS = 24.0
         MAX_NUM_FRAMES = 257
         target_frames_rounded = round(duration * FPS)
         n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
         actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(n_val * 8 + 1)))
         height_padded = ((height - 1) // 32 + 1) * 32
         width_padded = ((width - 1) // 32 + 1) * 32
         padding_values = calculate_padding(height, width, height_padded, width_padded)
         generator = torch.Generator(device=self.device).manual_seed(used_seed)
         conditioning_items = []
@@ -441,6 +493,7 @@ class VideoService:
                 end_tensor = self._prepare_conditioning_tensor(end_image_filepath, height, width, padding_values)
                 last_frame_index = actual_num_frames - 1
                 conditioning_items.append(ConditioningItem(end_tensor, last_frame_index, float(end_image_weight)))
         call_kwargs = {
             "prompt": prompt,
@@ -450,7 +503,7 @@ class VideoService:
             "num_frames": actual_num_frames,
             "frame_rate": int(FPS),
             "generator": generator,
-            "output_type": "latent" if external_decode else "pt",  # aqui alternamos o tipo de saída
             "conditioning_items": conditioning_items if conditioning_items else None,
             "media_items": None,
             "decode_timestep": self.config["decode_timestep"],
@@ -464,92 +517,111 @@ class VideoService:
             "enhance_prompt": False,
             "skip_layer_strategy": SkipLayerStrategy.AttentionValues,
         }
         if mode == "video-to-video":
-            call_kwargs["media_items"] = load_media_file(
                 media_path=input_video_filepath,
                 height=height,
                 width=width,
                 max_frames=int(frames_to_use),
                 padding=padding_values,
             ).to(self.device)
         latents = None
         result_tensor = None
         multi_scale_pipeline = None
-        if improve_texture:
-            if not self.latent_upsampler:
-                raise ValueError("Upscaler espacial não carregado.")
-            multi_scale_pipeline = LTXMultiScalePipeline(self.pipeline, self.latent_upsampler)
-            first_pass_args = self.config.get("first_pass", {}).copy()
-            first_pass_args["guidance_scale"] = float(guidance_scale)
-            second_pass_args = self.config.get("second_pass", {}).copy()
-            second_pass_args["guidance_scale"] = float(guidance_scale)
-            multi_scale_call_kwargs = call_kwargs.copy()
-            multi_scale_call_kwargs.update(
-                {
-                    "downscale_factor": self.config["downscale_factor"],
-                    "first_pass": first_pass_args,
-                    "second_pass": second_pass_args,
-                }
-            )
-            ctx = contextlib.nullcontext()
-            if self.device == "cuda":
-                ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype)
-            with ctx:
-                result = multi_scale_pipeline(**multi_scale_call_kwargs)
-            # Captura latentes ou imagens conforme o output_type
-            if external_decode:
-                latents = getattr(result, "latents", None) or getattr(result, "images", None) or result
-            else:
-                result_tensor = getattr(result, "images", None) or result
-            if not external_decode:
-                log_tensor_info(result_tensor, "Resultado da Etapa 2 (Saída do Pipeline Multi-Scale)")
-        else:
-            single_pass_kwargs = call_kwargs.copy()
-            first_pass_config = self.config.get("first_pass", {})
-            single_pass_kwargs.update(
-                {
-                    "guidance_scale": float(guidance_scale),
-                    "stg_scale": first_pass_config.get("stg_scale"),
-                    "rescaling_scale": first_pass_config.get("rescaling_scale"),
-                    "skip_block_list": first_pass_config.get("skip_block_list"),
-                }
-            )
-            # Agenda única para guidance_mapping consistente
-            schedule = first_pass_config.get("timesteps")
-            if schedule is None:
-                schedule = first_pass_config.get("guidance_timesteps")
-            if mode == "video-to-video":
-                schedule = [0.7]
-                print("[INFO] Modo video-to-video (etapa única): definindo timesteps (força) para [0.7]")
-            if isinstance(schedule, (list, tuple)) and len(schedule) > 0:
-                single_pass_kwargs["timesteps"] = schedule
-                single_pass_kwargs["guidance_timesteps"] = schedule
-            print("\n[INFO] Executando pipeline de etapa única...")
-            ctx = contextlib.nullcontext()
-            if self.device == "cuda":
-                ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype)
-            with ctx:
-                result = self.pipeline(**single_pass_kwargs)
-            if external_decode:
-                latents = getattr(result, "latents", None) or getattr(result, "images", None) or result
             else:
-                result_tensor = getattr(result, "images", None) or result
-        # Staging seguro em tmp e move para diretório persistente
-        temp_dir = tempfile.mkdtemp(prefix="ltxv_")
-        self._register_tmp_dir(temp_dir)
-        results_dir = "/app/output"
-        os.makedirs(results_dir, exist_ok=True)
-        final_output_path = None
-        output_video_path = os.path.join(temp_dir, f"output_{used_seed}.mp4")
-        try:
             if external_decode:
-                # Decodifica latentes -> MP4, quadro a quadro
                 self._decode_latents_to_video(
                     latents=latents,
                     output_video_path=output_video_path,
@@ -558,15 +630,9 @@ class VideoService:
                     progress_callback=progress_callback,
                 )
             else:
-                # Caminho antigo: tensor já em espaço de pixels -> escrever quadro a quadro
-                # Aplicar corte de padding antes de escrever
-                pad_left, pad_right, pad_top, pad_bottom = padding_values
-                slice_h_end = -pad_bottom if pad_bottom > 0 else None
-                slice_w_end = -pad_right if pad_right > 0 else None
-                result_tensor = result_tensor[:, :, :actual_num_frames, pad_top:slice_h_end, pad_left:slice_w_end]
-                log_tensor_info(result_tensor, "Tensor Final (Após Pós-processamento, Antes de Salvar)")
                 with imageio.get_writer(output_video_path, fps=call_kwargs["frame_rate"], codec="libx264", quality=8) as writer:
-                    T = result_tensor.shape[2]
                     for i in range(T):
                         frame_chw = result_tensor[0, :, i]
                         frame_hwc_u8 = (frame_chw.permute(1, 2, 0)
@@ -578,17 +644,27 @@ class VideoService:
                         writer.append_data(frame_hwc_u8)
                         if progress_callback:
                             progress_callback(i + 1, T)
             candidate_final = os.path.join(results_dir, f"output_{used_seed}.mp4")
             try:
                 shutil.move(output_video_path, candidate_final)
                 final_output_path = candidate_final
-            except Exception:
                 final_output_path = output_video_path
-            self._register_tmp_file(output_video_path)
             self._log_gpu_memory("Fim da Geração")
             return final_output_path, used_seed
         finally:
             try:
                 del latents
@@ -611,13 +687,13 @@ class VideoService:
                         torch.cuda.ipc_collect()
                     except Exception:
                         pass
-            except Exception:
-                pass
             try:
-                self.finalize(keep_paths=[final_output_path] if final_output_path else [])
-            except Exception:
-                pass
 print("Criando instância do VideoService. O carregamento do modelo começará agora...")
 video_generation_service = VideoService()

+# ltx_server.py — VideoService com logs de depuração detalhados (init→MP4)
+# Opção external_decode: True (default) decodifica latentes com VAE fora da pipeline.
+# --- 1. IMPORTAÇÕES ---
 import torch
 import numpy as np
 import random
 import gc
 import shutil
 import contextlib
+import time
+import traceback
 # --- 2. GERENCIAMENTO DE DEPENDÊNCIAS E SETUP ---
 def _query_gpu_processes_via_nvml(device_index: int) -> List[Dict]:
     try:
         import psutil
         import pynvml as nvml
+        print("[DEBUG] NVML: inicializando para consulta de processos...")
         nvml.nvmlInit()
         handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
         try:
             except Exception:
                 pass
             results.append({"pid": pid, "name": name, "user": user, "used_mb": used_mb})
+        print("[DEBUG] NVML: finalizando...")
         nvml.nvmlShutdown()
         return results
+    except Exception as e:
+        print(f"[DEBUG] NVML indisponível ou falhou: {e}")
         return []
 def _query_gpu_processes_via_nvidiasmi(device_index: int) -> List[Dict]:
     cmd = f"nvidia-smi -i {device_index} --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits"
     try:
+        print(f"[DEBUG] Rodando: {cmd}")
         out = subprocess.check_output(shlex.split(cmd), stderr=subprocess.STDOUT, text=True, timeout=2.0)
+    except Exception as e:
+        print(f"[DEBUG] nvidia-smi falhou: {e}")
         return []
     results = []
     for line in out.strip().splitlines():
         parts = [p.strip() for p in line.split(",")]
         if len(parts) >= 3:
             try:
+                pid = int(parts[0])
+                name = parts[1]
+                used_mb = int(parts[2])
                 user = "unknown"
                 try:
                     import psutil
     return "\n".join(lines) + "\n"
 def run_setup():
     setup_script_path = "setup.py"
     if not os.path.exists(setup_script_path):
+        print("[DEBUG] 'setup.py' não encontrado. Pulando clonagem de dependências.")
         return
     try:
+        print("[DEBUG] Executando setup.py para dependências...")
         subprocess.run([sys.executable, setup_script_path], check=True)
+        print("[DEBUG] Setup concluído com sucesso.")
     except subprocess.CalledProcessError as e:
+        print(f"[DEBUG] ERRO no setup.py (code {e.returncode}). Abortando.")
         sys.exit(1)
 DEPS_DIR = Path("/data")
 LTX_VIDEO_REPO_DIR = DEPS_DIR / "LTX-Video"
 if not LTX_VIDEO_REPO_DIR.exists():
+    print(f"[DEBUG] Repositório não encontrado em {LTX_VIDEO_REPO_DIR}. Rodando setup...")
     run_setup()
 def add_deps_to_path():
+    repo_path = str(LTX_VIDEO_REPO_DIR.resolve())
     if str(LTX_VIDEO_REPO_DIR.resolve()) not in sys.path:
+        sys.path.insert(0, repo_path)
+        print(f"[DEBUG] Repo adicionado ao sys.path: {repo_path}")
 add_deps_to_path()
 # --- 4. FUNÇÕES HELPER DE LOG ---
 def log_tensor_info(tensor, name="Tensor"):
     if not isinstance(tensor, torch.Tensor):
+        print(f"\n[INFO] '{name}' não é tensor.")
         return
+    print(f"\n--- Tensor: {name} ---")
+    print(f"  - Shape: {tuple(tensor.shape)}")
     print(f"  - Dtype: {tensor.dtype}")
     print(f"  - Device: {tensor.device}")
     if tensor.numel() > 0:
+        try:
+            print(f"  - Min: {tensor.min().item():.4f}  Max: {tensor.max().item():.4f}  Mean: {tensor.mean().item():.4f}")
+        except Exception:
+            pass
     print("------------------------------------------\n")
 # --- 5. CLASSE PRINCIPAL DO SERVIÇO ---
 class VideoService:
     def __init__(self):
+        t0 = time.perf_counter()
+        print("[DEBUG] Inicializando VideoService...")
+        self.debug = os.getenv("LTXV_DEBUG", "1") == "1"
+        self.frame_log_every = int(os.getenv("LTXV_FRAME_LOG_EVERY", "8"))
         self.config = self._load_config()
+        print(f"[DEBUG] Config carregada (precision={self.config.get('precision')}, sampler={self.config.get('sampler')})")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"[DEBUG] Device selecionado: {self.device}")
         self.last_memory_reserved_mb = 0.0
         self._tmp_dirs = set()
         self._tmp_files = set()
         self._last_outputs = []
         self.pipeline, self.latent_upsampler = self._load_models()
+        print(f"[DEBUG] Pipeline e Upsampler carregados. Upsampler ativo? {bool(self.latent_upsampler)}")
+        print(f"[DEBUG] Movendo modelos para {self.device}...")
         self.pipeline.to(self.device)
         if self.latent_upsampler:
             self.latent_upsampler.to(self.device)
         self._apply_precision_policy()
+        print(f"[DEBUG] runtime_autocast_dtype = {getattr(self, 'runtime_autocast_dtype', None)}")
         if self.device == "cuda":
             torch.cuda.empty_cache()
             self._log_gpu_memory("Após carregar modelos")
+        print(f"[DEBUG] VideoService pronto. boot_time={time.perf_counter()-t0:.3f}s")
     def _log_gpu_memory(self, stage_name: str):
         if self.device != "cuda":
         processes = _query_gpu_processes_via_nvml(device_index)
         if not processes:
             processes = _query_gpu_processes_via_nvidiasmi(device_index)
+        print(f"\n--- [LOG GPU] {stage_name} (cuda:{device_index}) ---")
+        print(f"  - Reservado: {current_reserved_mb:.2f} MB / {total_memory_mb:.2f} MB  (Δ={delta_mb:+.2f} MB)")
         if peak_reserved_mb > getattr(self, "last_memory_reserved_mb", 0.0):
+            print(f"  - Pico reservado (nesta fase): {peak_reserved_mb:.2f} MB")
         print(_gpu_process_table(processes, os.getpid()), end="")
         print("--------------------------------------------------\n")
         self.last_memory_reserved_mb = current_reserved_mb
     def _register_tmp_dir(self, d: str):
+        if d and os.path.isdir(d):
+            self._tmp_dirs.add(d)
+            print(f"[DEBUG] Registrado tmp dir: {d}")
     def _register_tmp_file(self, f: str):
+        if f and os.path.exists(f):
+            self._tmp_files.add(f)
+            print(f"[DEBUG] Registrado tmp file: {f}")
     def finalize(self, keep_paths=None, extra_paths=None, clear_gpu=True):
+        print("[DEBUG] Finalize: iniciando limpeza...")
         keep = set(keep_paths or [])
         extras = set(extra_paths or [])
+        removed_files = 0
         for f in list(self._tmp_files | extras):
             try:
                 if f not in keep and os.path.isfile(f):
                     os.remove(f)
+                    removed_files += 1
+                    print(f"[DEBUG] Removido arquivo tmp: {f}")
+            except Exception as e:
+                print(f"[DEBUG] Falha removendo arquivo {f}: {e}")
             finally:
                 self._tmp_files.discard(f)
+        removed_dirs = 0
         for d in list(self._tmp_dirs):
             try:
                 if d not in keep and os.path.isdir(d):
                     shutil.rmtree(d, ignore_errors=True)
+                    removed_dirs += 1
+                    print(f"[DEBUG] Removido diretório tmp: {d}")
+            except Exception as e:
+                print(f"[DEBUG] Falha removendo diretório {d}: {e}")
             finally:
                 self._tmp_dirs.discard(d)
+        print(f"[DEBUG] Finalize: arquivos removidos={removed_files}, dirs removidos={removed_dirs}")
         gc.collect()
         try:
             if clear_gpu and torch.cuda.is_available():
                     torch.cuda.ipc_collect()
                 except Exception:
                     pass
+        except Exception as e:
+            print(f"[DEBUG] Finalize: limpeza GPU falhou: {e}")
         try:
             self._log_gpu_memory("Após finalize")
+        except Exception as e:
+            print(f"[DEBUG] Log GPU pós-finalize falhou: {e}")
     def _load_config(self):
         base = LTX_VIDEO_REPO_DIR / "configs"
         ]
         for cfg in candidates:
             if cfg.exists():
+                print(f"[DEBUG] Config selecionada: {cfg}")
                 with open(cfg, "r") as file:
                     return yaml.safe_load(file)
+        cfg = base / "ltxv-13b-0.9.8-distilled.yaml"
+        print(f"[DEBUG] Config fallback: {cfg}")
+        with open(cfg, "r") as file:
             return yaml.safe_load(file)
     def _load_models(self):
         """Download the model files from the Hub and build the pipeline objects.

         Reads ``checkpoint_path``, ``precision`` and the two
         ``prompt_enhancer_*`` entries from ``self.config``, plus the optional
         ``spatial_upscaler_model_path``.  The path entries that get downloaded
         are overwritten in ``self.config`` with the local paths returned by
         ``hf_hub_download``.

         Returns:
             A ``(pipeline, latent_upsampler)`` tuple.  ``latent_upsampler`` is
             ``None`` when no spatial upscaler is configured.
         """
         t0 = time.perf_counter()
         LTX_REPO = "Lightricks/LTX-Video"
         hf_token = os.getenv("HF_TOKEN")

         print("[DEBUG] Baixando checkpoint principal (hf_hub_download)...")
         distilled_model_path = hf_hub_download(
             repo_id=LTX_REPO,
             filename=self.config["checkpoint_path"],
             token=hf_token,
         )
         self.config["checkpoint_path"] = distilled_model_path
         print(f"[DEBUG] Checkpoint em: {distilled_model_path}")

         # The upscaler is optional: only download it when the config names one,
         # so the guard below can actually yield ``latent_upsampler = None``
         # instead of failing earlier on a missing/empty config entry.
         spatial_upscaler_path = None
         upscaler_filename = self.config.get("spatial_upscaler_model_path")
         if upscaler_filename:
             print("[DEBUG] Baixando upscaler espacial (hf_hub_download)...")
             spatial_upscaler_path = hf_hub_download(
                 repo_id=LTX_REPO,
                 filename=upscaler_filename,
                 token=hf_token,
             )
             self.config["spatial_upscaler_model_path"] = spatial_upscaler_path
             print(f"[DEBUG] Upscaler em: {spatial_upscaler_path}")

         print("[DEBUG] Construindo pipeline...")
         pipeline = create_ltx_video_pipeline(
             ckpt_path=self.config["checkpoint_path"],
             precision=self.config["precision"],
             prompt_enhancer_image_caption_model_name_or_path=self.config["prompt_enhancer_image_caption_model_name_or_path"],
             prompt_enhancer_llm_model_name_or_path=self.config["prompt_enhancer_llm_model_name_or_path"],
         )
         print("[DEBUG] Pipeline pronto.")

         latent_upsampler = None
         if spatial_upscaler_path:
             print("[DEBUG] Construindo latent_upsampler...")
             latent_upsampler = create_latent_upsampler(spatial_upscaler_path, device="cpu")
             print("[DEBUG] Upsampler pronto.")
         print(f"[DEBUG] _load_models() tempo total={time.perf_counter()-t0:.3f}s")
         return pipeline, latent_upsampler
     def _promote_fp8_weights_to_bf16(self, module):
         if not isinstance(module, torch.nn.Module):
+            print("[DEBUG] Promoção FP8→BF16 ignorada: alvo não é nn.Module.")
             return
         f8 = getattr(torch, "float8_e4m3fn", None)
         if f8 is None:
+            print("[DEBUG] torch.float8_e4m3fn indisponível.")
             return
+        p_cnt = b_cnt = 0
         for _, p in module.named_parameters(recurse=True):
             try:
                 if p.dtype == f8:
                     with torch.no_grad():
                         p.data = p.data.to(torch.bfloat16)
+                        p_cnt += 1
             except Exception:
                 pass
         for _, b in module.named_buffers(recurse=True):
             try:
                 if hasattr(b, "dtype") and b.dtype == f8:
                     b.data = b.data.to(torch.bfloat16)
+                    b_cnt += 1
             except Exception:
                 pass
+        print(f"[DEBUG] FP8→BF16: params_promoted={p_cnt}, buffers_promoted={b_cnt}")
     def _apply_precision_policy(self):
         prec = str(self.config.get("precision", "")).lower()
         self.runtime_autocast_dtype = torch.float32
+        print(f"[DEBUG] Aplicando política de precisão: {prec}")
         if prec == "float8_e4m3fn":
             self.runtime_autocast_dtype = torch.bfloat16
             force_promote = os.getenv("LTXV_FORCE_BF16_ON_FP8", "0") == "1"
+            print(f"[DEBUG] FP8 detectado. force_promote={force_promote}")
             if force_promote and hasattr(torch, "float8_e4m3fn"):
                 try:
                     self._promote_fp8_weights_to_bf16(self.pipeline)
+                except Exception as e:
+                    print(f"[DEBUG] Promoção FP8→BF16 na pipeline falhou: {e}")
                 try:
                     if self.latent_upsampler:
                         self._promote_fp8_weights_to_bf16(self.latent_upsampler)
+                except Exception as e:
+                    print(f"[DEBUG] Promoção FP8→BF16 no upsampler falhou: {e}")
         elif prec == "bfloat16":
             self.runtime_autocast_dtype = torch.bfloat16
         elif prec == "mixed_precision":
             self.runtime_autocast_dtype = torch.float32
     def _prepare_conditioning_tensor(self, filepath, height, width, padding_values):
         """Load an image as a padded conditioning tensor placed on ``self.device``.

         The image is loaded through ``load_image_to_tensor_with_resize_and_crop``
         for the requested ``height``/``width``, padded with ``padding_values``
         and moved to the device.  On CUDA it is also cast to
         ``self.runtime_autocast_dtype``; on any other device the dtype is kept.
         """
         print(f"[DEBUG] Carregando condicionamento: {filepath}")
         image = load_image_to_tensor_with_resize_and_crop(filepath, height, width)
         padded = torch.nn.functional.pad(image, padding_values)
         if self.device == "cuda":
             conditioned = padded.to(self.device, dtype=self.runtime_autocast_dtype)
         else:
             conditioned = padded.to(self.device)
         print(f"[DEBUG] Cond shape={tuple(conditioned.shape)} dtype={conditioned.dtype} device={conditioned.device}")
         return conditioned
     def _decode_latents_to_video(self, latents: torch.Tensor, output_video_path: str, frame_rate: int,
                                  padding_values, progress_callback=None):
         """Decode latents one frame at a time and stream the frames into an MP4.

         Frames are taken along dimension 2 of ``latents`` (``latents[0, :, i]``),
         decoded with ``pipeline.decode_latents`` when available, otherwise with
         ``pipeline.vae.decode``.  Each decoded frame is normalised to ``[0, 1]``,
         cropped to remove the padding and written as uint8 HWC data.

         Args:
             latents: Latent tensor; only batch element 0 is decoded.
             output_video_path: Destination file for the encoded video.
             frame_rate: Frames per second passed to the writer.
             padding_values: ``(pad_left, pad_right, pad_top, pad_bottom)`` to crop
                 from every decoded frame.
             progress_callback: Optional callable invoked as
                 ``progress_callback(frames_done, total_frames)`` after each frame.

         Raises:
             RuntimeError: If the pipeline has neither ``decode_latents`` nor
                 ``vae.decode``.
         """
         print(f"[DEBUG] Decodificando latentes → vídeo: {output_video_path}")
         pad_left, pad_right, pad_top, pad_bottom = padding_values
         T = latents.shape[2]
         print(f"[DEBUG] Latentes shape={tuple(latents.shape)} frames={T}")
         start = time.perf_counter()
         with imageio.get_writer(output_video_path, fps=frame_rate, codec="libx264", quality=8) as writer:
             for i in range(T):
                 latent_chw = latents[0, :, i].to(self.device)
                 with torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext():
                     pixel_bchw = None
                     if hasattr(self.pipeline, "decode_latents"):
                         pixel_bchw = self.pipeline.decode_latents(latent_chw.unsqueeze(0))
                         if i % self.frame_log_every == 0:
                             print(f"[DEBUG] decode_latents frame={i}")
                     elif hasattr(self.pipeline, "vae") and hasattr(self.pipeline.vae, "decode"):
                         pixel_bchw = self.pipeline.vae.decode(latent_chw.unsqueeze(0))
                         if i % self.frame_log_every == 0:
                             print(f"[DEBUG] vae.decode frame={i}")
                     else:
                         raise RuntimeError("Pipeline não possui decode_latents/vae.decode.")
                 pixel_chw = pixel_bchw[0]
                 # Decoder output containing negatives is treated as [-1, 1] and
                 # remapped to [0, 1]; otherwise it is only clamped to [0, 1].
                 if pixel_chw.min() < 0:
                     pixel_chw = (pixel_chw.clamp(-1, 1) + 1.0) / 2.0
                 else:
                     pixel_chw = pixel_chw.clamp(0, 1)
                 H, W = pixel_chw.shape[1], pixel_chw.shape[2]
                 # A zero pad must map to the full extent (``x[:-0]`` would be empty).
                 h_end = H - pad_bottom if pad_bottom > 0 else H
                 w_end = W - pad_right if pad_right > 0 else W
                 pixel_chw = pixel_chw[:, pad_top:h_end, pad_left:w_end]
                 # CHW float in [0, 1] -> HWC uint8 on the CPU for the video writer.
                 # Cast to float32 first so the x255 scaling is not done in a
                 # reduced-precision autocast dtype.
                 frame_hwc_u8 = (
                     pixel_chw.detach()
                     .permute(1, 2, 0)
                     .float()
                     .mul(255)
                     .to(torch.uint8)
                     .cpu()
                     .numpy()
                 )
                 writer.append_data(frame_hwc_u8)
                 if progress_callback:
                     progress_callback(i + 1, T)
                 if i % self.frame_log_every == 0:
                     print(f"[DEBUG] frame {i}/{T} escrito.")
         print(f"[DEBUG] Decodificação+escrita concluída em {time.perf_counter()-start:.3f}s")
     def generate(
         self,
         guidance_scale=3.0,
         improve_texture=True,
         progress_callback=None,
+        external_decode=True,
     ):
         # Runs one generation request: seeds the RNGs, builds the pipeline
         # kwargs, runs either the multi-scale pipeline (improve_texture) or a
         # single pass, writes the result to an MP4 and returns
         # (final_output_path, used_seed).
         # NOTE(review): this view of the function is not contiguous. Names used
         # below (mode, prompt, seed, randomize_seed, duration, height, width,
         # input_video_filepath, frames_to_use, end_image_filepath,
         # end_image_weight) do not appear in the visible signature; presumably
         # they are parameters declared on lines not shown here - confirm
         # against the full file.
+        t_all = time.perf_counter()
+        print(f"[DEBUG] generate() begin mode={mode} external_decode={external_decode} improve_texture={improve_texture}")
         if self.device == "cuda":
             torch.cuda.empty_cache()
             torch.cuda.reset_peak_memory_stats()
         used_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)
         seed_everething(used_seed)
+        print(f"[DEBUG] Seed usado: {used_seed}")
         FPS = 24.0
         MAX_NUM_FRAMES = 257
         # The frame count is snapped to the form 8*n + 1 and clamped to
         # [9, MAX_NUM_FRAMES].
         target_frames_rounded = round(duration * FPS)
         n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
         actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(n_val * 8 + 1)))
+        print(f"[DEBUG] Frames alvo: {actual_num_frames} (dur={duration}s @ {FPS}fps)")
         # Round each dimension up to the next multiple of 32.
         height_padded = ((height - 1) // 32 + 1) * 32
         width_padded = ((width - 1) // 32 + 1) * 32
         padding_values = calculate_padding(height, width, height_padded, width_padded)
+        print(f"[DEBUG] Dimensões: ({height},{width}) -> pad ({height_padded},{width_padded}); padding={padding_values}")
         generator = torch.Generator(device=self.device).manual_seed(used_seed)
         conditioning_items = []
                 # NOTE(review): the statements that open this indented branch are
                 # not visible in this view; only the end-image conditioning item
                 # (anchored at the last frame index) is shown.
                 end_tensor = self._prepare_conditioning_tensor(end_image_filepath, height, width, padding_values)
                 last_frame_index = actual_num_frames - 1
                 conditioning_items.append(ConditioningItem(end_tensor, last_frame_index, float(end_image_weight)))
+            print(f"[DEBUG] Conditioning items: {len(conditioning_items)}")
         # Arguments shared by the multi-scale and single-pass pipeline calls.
         call_kwargs = {
             "prompt": prompt,
             "num_frames": actual_num_frames,
             "frame_rate": int(FPS),
             "generator": generator,
+            "output_type": "latent" if external_decode else "pt",
             "conditioning_items": conditioning_items if conditioning_items else None,
             "media_items": None,
             "decode_timestep": self.config["decode_timestep"],
             "enhance_prompt": False,
             "skip_layer_strategy": SkipLayerStrategy.AttentionValues,
         }
+        print(f"[DEBUG] call_kwargs.output_type={call_kwargs['output_type']} skip_layer_strategy={call_kwargs['skip_layer_strategy']}")
         if mode == "video-to-video":
+            media = load_media_file(
                 media_path=input_video_filepath,
                 height=height,
                 width=width,
                 max_frames=int(frames_to_use),
                 padding=padding_values,
             ).to(self.device)
+            call_kwargs["media_items"] = media
+            print(f"[DEBUG] media_items shape={tuple(media.shape)}")
         latents = None
         result_tensor = None
         multi_scale_pipeline = None
+        try:
             # Multi-scale path: requires the latent upsampler and passes the
             # configured first_pass/second_pass settings with guidance_scale
             # overridden by the caller's value.
+            if improve_texture:
+                if not self.latent_upsampler:
+                    raise ValueError("Upscaler espacial não carregado.")
+                print("[DEBUG] Multi-escala: construindo pipeline...")
+                multi_scale_pipeline = LTXMultiScalePipeline(self.pipeline, self.latent_upsampler)
+                first_pass_args = self.config.get("first_pass", {}).copy()
+                first_pass_args["guidance_scale"] = float(guidance_scale)
+                second_pass_args = self.config.get("second_pass", {}).copy()
+                second_pass_args["guidance_scale"] = float(guidance_scale)
+                multi_scale_call_kwargs = call_kwargs.copy()
+                multi_scale_call_kwargs.update(
+                    {
+                        "downscale_factor": self.config["downscale_factor"],
+                        "first_pass": first_pass_args,
+                        "second_pass": second_pass_args,
+                    }
+                )
+                print("[DEBUG] Chamando multi_scale_pipeline...")
+                t_ms = time.perf_counter()
+                ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext()
+                with ctx:
+                    result = multi_scale_pipeline(**multi_scale_call_kwargs)
+                print(f"[DEBUG] multi_scale_pipeline tempo={time.perf_counter()-t_ms:.3f}s")
                 # The result may be an object exposing .latents or .images, or
                 # the tensor itself; take whichever is present.
+                if external_decode:
+                    if hasattr(result, "latents"):
+                        latents = result.latents
+                    elif hasattr(result, "images") and isinstance(result.images, torch.Tensor):
+                        latents = result.images
+                    else:
+                        latents = result
+                    print(f"[DEBUG] Latentes obtidos (multi-escala): shape={tuple(latents.shape)}")
+                else:
+                    result_tensor = result.images if hasattr(result, "images") else result
+                    print(f"[DEBUG] Pixels obtidos (multi-escala): shape={tuple(result_tensor.shape)}")
+                    log_tensor_info(result_tensor, "Saída Multi-Scale (pixel)")
             else:
                 # Single-pass path: reuses the first_pass settings from the config.
+                single_pass_kwargs = call_kwargs.copy()
+                first_pass_config = self.config.get("first_pass", {})
+                single_pass_kwargs.update(
+                    {
+                        "guidance_scale": float(guidance_scale),
+                        "stg_scale": first_pass_config.get("stg_scale"),
+                        "rescaling_scale": first_pass_config.get("rescaling_scale"),
+                        "skip_block_list": first_pass_config.get("skip_block_list"),
+                    }
+                )
+                schedule = first_pass_config.get("timesteps")
+                if schedule is None:
+                    schedule = first_pass_config.get("guidance_timesteps")
                 # video-to-video replaces any configured schedule with a single
                 # timestep.
+                if mode == "video-to-video":
+                    schedule = [0.7]
+                    print("[INFO] Modo video-to-video (etapa única): timesteps=[0.7]")
+                if isinstance(schedule, (list, tuple)) and len(schedule) > 0:
+                    single_pass_kwargs["timesteps"] = schedule
+                    single_pass_kwargs["guidance_timesteps"] = schedule
+                print(f"[DEBUG] Single-pass: timesteps_len={len(schedule) if schedule else 0}")
+                print("\n[INFO] Executando pipeline de etapa única...")
+                t_sp = time.perf_counter()
+                ctx = torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype) if self.device == "cuda" else contextlib.nullcontext()
+                with ctx:
+                    result = self.pipeline(**single_pass_kwargs)
+                print(f"[DEBUG] single-pass tempo={time.perf_counter()-t_sp:.3f}s")
+                if external_decode:
+                    if hasattr(result, "latents"):
+                        latents = result.latents
+                    elif hasattr(result, "images") and isinstance(result.images, torch.Tensor):
+                        latents = result.images
+                    else:
+                        latents = result
+                    print(f"[DEBUG] Latentes obtidos (single-pass): shape={tuple(latents.shape)}")
+                else:
+                    result_tensor = result.images if hasattr(result, "images") else result
+                    print(f"[DEBUG] Pixels obtidos (single-pass): shape={tuple(result_tensor.shape)}")
             # The video is encoded into a registered temp dir first, then moved
             # to results_dir further below.
+            temp_dir = tempfile.mkdtemp(prefix="ltxv_")
+            self._register_tmp_dir(temp_dir)
+            results_dir = "/app/output"
+            os.makedirs(results_dir, exist_ok=True)
+            output_video_path = os.path.join(temp_dir, f"output_{used_seed}.mp4")
+            final_output_path = None
             if external_decode:
+                print("[DEBUG] Iniciando decodificação de latentes → MP4...")
                 # NOTE(review): _decode_latents_to_video also declares frame_rate
                 # and padding_values without defaults; they are not passed in the
                 # visible call - presumably on lines not shown here, confirm.
                 self._decode_latents_to_video(
                     latents=latents,
                     output_video_path=output_video_path,
                     progress_callback=progress_callback,
                 )
             else:
+                print("[DEBUG] Escrevendo vídeo a partir de pixels (sem latentes)...")
                 with imageio.get_writer(output_video_path, fps=call_kwargs["frame_rate"], codec="libx264", quality=8) as writer:
                     # NOTE(review): `[^21_2]` is not a valid index expression;
                     # given the `result_tensor[0, :, i]` loop below this is
                     # presumably `shape[2]` - confirm and fix.
+                    T = result_tensor.shape[^21_2]
                     for i in range(T):
                         frame_chw = result_tensor[0, :, i]
                         # NOTE(review): the expression below is cut off in this
                         # view (unclosed parenthesis); the rest of the frame
                         # conversion is not visible.
                         frame_hwc_u8 = (frame_chw.permute(1, 2, 0)
                         writer.append_data(frame_hwc_u8)
                         if progress_callback:
                             progress_callback(i + 1, T)
+                        if i % self.frame_log_every == 0:
+                            print(f"[DEBUG] frame {i}/{T} escrito (pixel).")
             candidate_final = os.path.join(results_dir, f"output_{used_seed}.mp4")
             try:
                 shutil.move(output_video_path, candidate_final)
                 final_output_path = candidate_final
+                print(f"[DEBUG] MP4 movido para {final_output_path}")
+            except Exception as e:
                 # Move failed: fall back to returning the temp-dir path.
                 final_output_path = output_video_path
+                print(f"[DEBUG] Falha no move; usando tmp como final: {e}")
+            self._register_tmp_file(output_video_path)
             self._log_gpu_memory("Fim da Geração")
+            print(f"[DEBUG] generate() fim ok. total_time={time.perf_counter()-t_all:.3f}s")
             return final_output_path, used_seed
+        except Exception as e:
             # Log the full traceback, then re-raise unchanged for the caller.
+            print("[DEBUG] EXCEÇÃO NA GERAÇÃO:")
+            print("".join(traceback.format_exception(type(e), e, e.__traceback__)))
+            raise
         finally:
             # NOTE(review): part of this cleanup block is not visible in this
             # view (the lines between `del latents` and
             # `torch.cuda.ipc_collect()`); cleanup errors are logged, not raised.
             try:
                 del latents
                         torch.cuda.ipc_collect()
                     except Exception:
                         pass
+            except Exception as e:
+                print(f"[DEBUG] Limpeza GPU no finally falhou: {e}")
             try:
+                self.finalize(keep_paths=[])
+            except Exception as e:
+                print(f"[DEBUG] finalize() no finally falhou: {e}")
 # Module-level side effect: a VideoService instance is created as soon as this
 # module is executed or imported; per the log message below, model loading
 # starts at this point.
 print("Criando instância do VideoService. O carregamento do modelo começará agora...")
 video_generation_service = VideoService()