euiiiia committed on
Commit 1c0198d · verified · 1 Parent(s): 1497820

Update api/ltx_server_refactored.py

Files changed (1)
  1. api/ltx_server_refactored.py +750 -340
api/ltx_server_refactored.py CHANGED
@@ -1,68 +1,201 @@
1
- # ltx_server_refactored.py — VideoService (Modular Version with Simple Overlap Chunking)
2
 
3
- # --- 0. WARNINGS E AMBIENTE ---
4
  import warnings
5
  warnings.filterwarnings("ignore", category=UserWarning)
6
  warnings.filterwarnings("ignore", category=FutureWarning)
7
- warnings.filterwarnings("ignore", message=".*")
8
- from huggingface_hub import logging
9
- logging.set_verbosity_error()
10
- logging.set_verbosity_warning()
11
- logging.set_verbosity_info()
12
- logging.set_verbosity_debug()
13
- LTXV_DEBUG=1
14
- LTXV_FRAME_LOG_EVERY=8
15
- import os, subprocess, shlex, tempfile
16
  import torch
17
- import json
18
  import numpy as np
19
- import random
20
- import os
21
- import shlex
22
- import yaml
23
- from typing import List, Dict
24
- from pathlib import Path
25
- import imageio
26
  from PIL import Image
27
- import tempfile
28
- from huggingface_hub import hf_hub_download
29
- import sys
30
- import subprocess
31
- import gc
32
- import shutil
33
- import contextlib
34
- import time
35
- import traceback
36
  from einops import rearrange
37
- import torch.nn.functional as F
 
 
38
  from managers.vae_manager import vae_manager_singleton
39
  from tools.video_encode_tool import video_encode_tool_singleton
40
  DEPS_DIR = Path("/data")
41
  LTX_VIDEO_REPO_DIR = DEPS_DIR / "LTX-Video"
 
 
42
 
43
- # (Todas as funções de setup, helpers e inicialização da classe permanecem inalteradas)
44
- # ... (run_setup, add_deps_to_path, _query_gpu_processes_via_nvml, etc.)
45
- def run_setup():
46
  setup_script_path = "setup.py"
47
  if not os.path.exists(setup_script_path):
48
  print("[DEBUG] 'setup.py' não encontrado. Pulando clonagem de dependências.")
49
  return
 
 
50
  try:
51
- print("[DEBUG] Executando setup.py para dependências...")
52
- subprocess.run([sys.executable, setup_script_path], check=True)
53
- print("[DEBUG] Setup concluído com sucesso.")
54
  except subprocess.CalledProcessError as e:
55
- print(f"[DEBUG] ERRO no setup.py (code {e.returncode}). Abortando.")
56
  sys.exit(1)
57
  if not LTX_VIDEO_REPO_DIR.exists():
58
- print(f"[DEBUG] Repositório não encontrado em {LTX_VIDEO_REPO_DIR}. Rodando setup...")
59
- run_setup()
60
- def add_deps_to_path():
61
- repo_path = str(LTX_VIDEO_REPO_DIR.resolve())
62
- if str(LTX_VIDEO_REPO_DIR.resolve()) not in sys.path:
63
- sys.path.insert(0, repo_path)
64
- print(f"[DEBUG] Repo adicionado ao sys.path: {repo_path}")
65
- def calculate_padding(orig_h, orig_w, target_h, target_w):
66
  pad_h = target_h - orig_h
67
  pad_w = target_w - orig_w
68
  pad_top = pad_h // 2
@@ -70,368 +203,645 @@ def calculate_padding(orig_h, orig_w, target_h, target_w):
70
  pad_left = pad_w // 2
71
  pad_right = pad_w - pad_left
72
  return (pad_left, pad_right, pad_top, pad_bottom)
73
- def log_tensor_info(tensor, name="Tensor"):
 
 
74
  if not isinstance(tensor, torch.Tensor):
75
- print(f"\n[INFO] '{name}' não é tensor.")
76
  return
77
- print(f"\n--- Tensor: {name} ---")
78
- print(f" - Shape: {tuple(tensor.shape)}")
79
- print(f" - Dtype: {tensor.dtype}")
80
  print(f" - Device: {tensor.device}")
81
  if tensor.numel() > 0:
82
  try:
83
- print(f" - Min: {tensor.min().item():.4f} Max: {tensor.max().item():.4f} Mean: {tensor.mean().item():.4f}")
84
- except Exception:
85
- pass
86
- print("------------------------------------------\n")
87
-
88
- add_deps_to_path()
89
- from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline
90
- from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
91
- from ltx_video.models.autoencoders.vae_encode import un_normalize_latents, normalize_latents
92
- from ltx_video.pipelines.pipeline_ltx_video import adain_filter_latent
93
- from api.ltx.inference import (
94
- create_ltx_video_pipeline,
95
- create_latent_upsampler,
96
- load_image_to_tensor_with_resize_and_crop,
97
- seed_everething,
98
- )
99
-
100
-
101
- def load_image_to_tensor_with_resize_and_crop(
102
- image_input: Union[str, Image.Image],
103
- target_height: int = 512,
104
- target_width: int = 768,
105
- just_crop: bool = False,
106
- ) -> torch.Tensor:
107
- """Load and process an image into a tensor.
108
-
109
- Args:
110
- image_input: Either a file path (str) or a PIL Image object
111
- target_height: Desired height of output tensor
112
- target_width: Desired width of output tensor
113
- just_crop: If True, only crop the image to the target size without resizing
114
- """
115
- if isinstance(image_input, str):
116
- image = Image.open(image_input).convert("RGB")
117
- elif isinstance(image_input, Image.Image):
118
- image = image_input
119
- else:
120
- raise ValueError("image_input must be either a file path or a PIL Image object")
121
-
122
- input_width, input_height = image.size
123
- aspect_ratio_target = target_width / target_height
124
- aspect_ratio_frame = input_width / input_height
125
- if aspect_ratio_frame > aspect_ratio_target:
126
- new_width = int(input_height * aspect_ratio_target)
127
- new_height = input_height
128
- x_start = (input_width - new_width) // 2
129
- y_start = 0
130
- else:
131
- new_width = input_width
132
- new_height = int(input_width / aspect_ratio_target)
133
- x_start = 0
134
- y_start = (input_height - new_height) // 2
135
-
136
- image = image.crop((x_start, y_start, x_start + new_width, y_start + new_height))
137
- if not just_crop:
138
- image = image.resize((target_width, target_height))
139
-
140
- image = np.array(image)
141
- image = cv2.GaussianBlur(image, (3, 3), 0)
142
- frame_tensor = torch.from_numpy(image).float()
143
- frame_tensor = crf_compressor.compress(frame_tensor / 255.0) * 255.0
144
- frame_tensor = frame_tensor.permute(2, 0, 1)
145
- frame_tensor = (frame_tensor / 127.5) - 1.0
146
- # Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
147
- return frame_tensor.unsqueeze(0).unsqueeze(2)
148
-
149
150
 
151
  class VideoService:
152
  def __init__(self):
 
153
  t0 = time.perf_counter()
154
- print("[DEBUG] Inicializando VideoService...")
155
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
156
- self.config = self._load_config()
157
- self.pipeline, self.latent_upsampler = self._load_models()
158
- self.pipeline.to(self.device)
159
- if self.latent_upsampler:
160
- self.latent_upsampler.to(self.device)
161
- self._apply_precision_policy()
162
  vae_manager_singleton.attach_pipeline(
163
  self.pipeline,
164
  device=self.device,
165
  autocast_dtype=self.runtime_autocast_dtype
166
  )
167
  self._tmp_dirs = set()
168
- print(f"[DEBUG] VideoService pronto. boot_time={time.perf_counter()-t0:.3f}s")
 
169
 
170
- def _load_config(self):
171
- base = LTX_VIDEO_REPO_DIR / "configs"
172
- config_path = base / "ltxv-13b-0.9.8-distilled-fp8.yaml"
173
- with open(config_path, "r") as file:
174
- return yaml.safe_load(file)
175
 
176
- def finalize(self, keep_paths=None, extra_paths=None, clear_gpu=True):
177
- print("[DEBUG] Finalize: iniciando limpeza...")
178
- keep = set(keep_paths or []); extras = set(extra_paths or [])
179
- gc.collect()
180
  try:
181
- if clear_gpu and torch.cuda.is_available():
182
- torch.cuda.empty_cache()
183
- try:
184
- torch.cuda.ipc_collect()
185
- except Exception:
186
- pass
187
  except Exception as e:
188
- print(f"[DEBUG] Finalize: limpeza GPU falhou: {e}")
189
  try:
190
- self._log_gpu_memory("Após finalize")
191
  except Exception as e:
192
- print(f"[DEBUG] Log GPU pós-finalize falhou: {e}")
193
 
194
- def _load_models(self):
195
  t0 = time.perf_counter()
196
  LTX_REPO = "Lightricks/LTX-Video"
197
- print("[DEBUG] Baixando checkpoint principal...")
198
- distilled_model_path = hf_hub_download(
199
- repo_id=LTX_REPO,
200
- filename=self.config["checkpoint_path"],
201
- local_dir=os.getenv("HF_HOME"),
202
- cache_dir=os.getenv("HF_HOME_CACHE"),
203
- token=os.getenv("HF_TOKEN"),
204
- )
205
- self.config["checkpoint_path"] = distilled_model_path
206
- print(f"[DEBUG] Checkpoint em: {distilled_model_path}")
207
-
208
- print("[DEBUG] Baixando upscaler espacial...")
209
- spatial_upscaler_path = hf_hub_download(
210
- repo_id=LTX_REPO,
211
- filename=self.config["spatial_upscaler_model_path"],
212
- local_dir=os.getenv("HF_HOME"),
213
- cache_dir=os.getenv("HF_HOME_CACHE"),
214
  token=os.getenv("HF_TOKEN")
215
  )
216
- self.config["spatial_upscaler_model_path"] = spatial_upscaler_path
217
- print(f"[DEBUG] Upscaler em: {spatial_upscaler_path}")
218
 
219
- print("[DEBUG] Construindo pipeline...")
220
  pipeline = create_ltx_video_pipeline(
221
  ckpt_path=self.config["checkpoint_path"],
222
  precision=self.config["precision"],
223
  text_encoder_model_name_or_path=self.config["text_encoder_model_name_or_path"],
224
  sampler=self.config["sampler"],
225
- device="cpu",
226
- enhance_prompt=False,
227
- prompt_enhancer_image_caption_model_name_or_path=self.config["prompt_enhancer_image_caption_model_name_or_path"],
228
- prompt_enhancer_llm_model_name_or_path=self.config["prompt_enhancer_llm_model_name_or_path"],
229
  )
230
- print("[DEBUG] Pipeline pronto.")
231
 
232
  latent_upsampler = None
233
  if self.config.get("spatial_upscaler_model_path"):
234
- print("[DEBUG] Construindo latent_upsampler...")
235
  latent_upsampler = create_latent_upsampler(self.config["spatial_upscaler_model_path"], device="cpu")
236
- print("[DEBUG] Upsampler pronto.")
237
- print(f"[DEBUG] _load_models() tempo total={time.perf_counter()-t0:.3f}s")
 
238
  return pipeline, latent_upsampler
239
 
240
- def _apply_precision_policy(self):
 
241
  prec = str(self.config.get("precision", "")).lower()
242
- self.runtime_autocast_dtype = torch.float32
243
  if prec in ["float8_e4m3fn", "bfloat16"]:
244
- self.runtime_autocast_dtype = torch.bfloat16
245
  elif prec == "mixed_precision":
246
- self.runtime_autocast_dtype = torch.float16
247
-
248
- def _register_tmp_dir(self, d: str):
249
- if d and os.path.isdir(d):
250
- self._tmp_dirs.add(d); print(f"[DEBUG] Registrado tmp dir: {d}")
251
 
252
  @torch.no_grad()
253
- def _upsample_latents_internal(self, latents: torch.Tensor) -> torch.Tensor:
254
- try:
255
- if not self.latent_upsampler:
256
- raise ValueError("Latent Upsampler não está carregado.")
257
- latents_unnormalized = un_normalize_latents(latents, self.pipeline.vae, vae_per_channel_normalize=True)
258
- upsampled_latents = self.latent_upsampler(latents_unnormalized)
259
- return normalize_latents(upsampled_latents, self.pipeline.vae, vae_per_channel_normalize=True)
260
- except Exception as e:
261
- pass
262
- finally:
263
- torch.cuda.empty_cache()
264
- torch.cuda.ipc_collect()
265
- self.finalize(keep_paths=[])
266
-
267
- def _prepare_conditioning_tensor(self, filepath, height, width, padding_values):
268
- tensor = load_image_to_tensor_with_resize_and_crop(filepath, height, width)
269
- tensor = torch.nn.functional.pad(tensor, padding_values)
270
- return tensor.to(self.device, dtype=self.runtime_autocast_dtype)
271
-
272
-
273
- def _save_and_log_video(self, pixel_tensor, base_filename, fps, temp_dir, results_dir, used_seed, progress_callback=None):
274
- output_path = os.path.join(temp_dir, f"{base_filename}_{used_seed}.mp4")
275
- video_encode_tool_singleton.save_video_from_tensor(
276
- pixel_tensor, output_path, fps=fps, progress_callback=progress_callback
277
- )
278
- final_path = os.path.join(results_dir, f"{base_filename}_{used_seed}.mp4")
279
- shutil.move(output_path, final_path)
280
- print(f"[DEBUG] Vídeo salvo em: {final_path}")
281
- return final_path
282
-
283
- # ==============================================================================
284
- # --- FUNÇÕES MODULARES COM A LÓGICA DE CHUNKING SIMPLIFICADA ---
285
- # ==============================================================================
286
-
287
- def _prepare_condition_items(self, items_list: List[Tuple], height: int, width: int, num_frames: int) -> List[ConditioningItem]:
288
- """Prepara os tensores de condicionamento a partir de imagens ou tensores."""
289
- if not items_list:
290
- return []
291
-
292
- height, width = self._calculate_downscaled_dims(height, width)
293
-
294
- height_padded = ((height - 1) // 8 + 1) * 8
295
- width_padded = ((width - 1) // 8 + 1) * 8
296
- padding_values = calculate_padding(height, width, height_padded, width_padded)
297
 
298
- conditioning_items = []
299
- for media, frame_idx, weight in items_list:
300
- if isinstance(media, str):
301
- tensor = self._prepare_conditioning_tensor_from_path(media, height, width, padding_values)
302
- else: # Assume que é um tensor
303
- tensor = media.to(self.device, dtype=self.runtime_autocast_dtype)
304
-
305
- # Garante que o frame de condicionamento esteja dentro dos limites do vídeo
306
- safe_frame_idx = max(0, min(int(frame_idx), num_frames - 1))
307
- conditioning_items.append(ConditioningItem(tensor, safe_frame_idx, float(weight)))
308
-
309
- return conditioning_items
310
 
311
  def _prepare_conditioning_tensor_from_path(self, filepath: str, height: int, width: int, padding: Tuple) -> torch.Tensor:
312
  """Carrega uma imagem, redimensiona, aplica padding e move para o dispositivo."""
313
- tensor = load_image_to_tensor_with_resize_and_crop(filepath, height, width)
314
  tensor = F.pad(tensor, padding)
315
  return tensor.to(self.device, dtype=self.runtime_autocast_dtype)
316
-
317
 
318
- def generate_low(self, prompt, negative_prompt, height, width, duration, guidance_scale, seed, conditioning_items=None):
319
- used_seed = random.randint(0, 2**32 - 1) if seed is None else int(seed)
320
- seed_everething(used_seed)
321
- FPS = 24.0
322
- actual_num_frames = max(9, int(round((round(duration * FPS) - 1) / 8.0) * 8 + 1))
323
  height_padded = ((height - 1) // 8 + 1) * 8
324
  width_padded = ((width - 1) // 8 + 1) * 8
325
- temp_dir = tempfile.mkdtemp(prefix="ltxv_low_"); self._register_tmp_dir(temp_dir)
326
- results_dir = "/app/output"; os.makedirs(results_dir, exist_ok=True)
327
  downscale_factor = self.config.get("downscale_factor", 0.6666666)
328
  vae_scale_factor = self.pipeline.vae_scale_factor
329
- x_width = int(width_padded * downscale_factor)
330
- downscaled_width = x_width - (x_width % vae_scale_factor)
331
- x_height = int(height_padded * downscale_factor)
332
- downscaled_height = x_height - (x_height % vae_scale_factor)
333
- first_pass_kwargs = {
334
- "prompt": prompt, "negative_prompt": negative_prompt, "height": downscaled_height, "width": downscaled_width,
335
- "num_frames": actual_num_frames, "frame_rate": int(FPS), "generator": torch.Generator(device=self.device).manual_seed(used_seed),
336
- "output_type": "latent", "conditioning_items": conditioning_items, "guidance_scale": float(guidance_scale),
337
- **(self.config.get("first_pass", {}))
338
- }
339
- try:
340
- with torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype, enabled=self.device == 'cuda'):
341
- latents = self.pipeline(**first_pass_kwargs).images
342
- pixel_tensor = vae_manager_singleton.decode(latents.clone(), decode_timestep=float(self.config.get("decode_timestep", 0.05)))
343
- video_path = self._save_and_log_video(pixel_tensor, "low_res_video", FPS, temp_dir, results_dir, used_seed)
344
- latents_cpu = latents.detach().to("cpu")
345
- tensor_path = os.path.join(results_dir, f"latents_low_res_{used_seed}.pt")
346
- torch.save(latents_cpu, tensor_path)
347
- return video_path, tensor_path, used_seed
348
 
349
- except Exception as e:
350
- pass
351
- finally:
352
- torch.cuda.empty_cache()
353
- torch.cuda.ipc_collect()
354
- self.finalize(keep_paths=[])
355
-
356
- def generate_upscale_denoise(self, latents_path, prompt, negative_prompt, guidance_scale, seed):
357
- used_seed = random.randint(0, 2**32 - 1) if seed is None else int(seed)
358
- seed_everething(used_seed)
359
- temp_dir = tempfile.mkdtemp(prefix="ltxv_up_"); self._register_tmp_dir(temp_dir)
360
- results_dir = "/app/output"; os.makedirs(results_dir, exist_ok=True)
361
- latents_low = torch.load(latents_path).to(self.device)
362
- with torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype, enabled=self.device == 'cuda'):
363
- upsampled_latents = self._upsample_latents_internal(latents_low)
364
- upsampled_latents = adain_filter_latent(latents=upsampled_latents, reference_latents=latents_low)
365
- del latents_low; torch.cuda.empty_cache()
366
-
367
- # --- LÓGICA DE DIVISÃO SIMPLES COM OVERLAP ---
368
- total_frames = upsampled_latents.shape[2]
369
- # Garante que mid_point seja pelo menos 1 para evitar um segundo chunk vazio se houver poucos frames
370
- mid_point = max(1, total_frames // 2)
371
- chunk1 = upsampled_latents[:, :, :mid_point, :, :]
372
- # O segundo chunk começa um frame antes para criar o overlap
373
- chunk2 = upsampled_latents[:, :, mid_point - 1:, :, :]
374
-
375
- final_latents_list = []
376
- for i, chunk in enumerate([chunk1, chunk2]):
377
- if chunk.shape[2] <= 1: continue # Pula chunks inválidos ou vazios
378
- second_pass_height = chunk.shape[3] * self.pipeline.vae_scale_factor
379
- second_pass_width = chunk.shape[4] * self.pipeline.vae_scale_factor
380
- second_pass_kwargs = {
381
- "prompt": prompt, "negative_prompt": negative_prompt, "height": second_pass_height, "width": second_pass_width,
382
- "num_frames": chunk.shape[2], "latents": chunk, "guidance_scale": float(guidance_scale),
383
- "output_type": "latent", "generator": torch.Generator(device=self.device).manual_seed(used_seed),
384
- **(self.config.get("second_pass", {}))
385
- }
386
- refined_chunk = self.pipeline(**second_pass_kwargs).images
387
- # Remove o overlap do primeiro chunk refinado antes de juntar
388
- if i == 0:
389
- final_latents_list.append(refined_chunk[:, :, :-1, :, :])
390
- else:
391
- final_latents_list.append(refined_chunk)
392
-
393
- final_latents = torch.cat(final_latents_list, dim=2)
394
- log_tensor_info(final_latents, "Latentes Upscaled/Refinados Finais")
395
-
396
- latents_cpu = final_latents.detach().to("cpu")
397
- tensor_path = os.path.join(results_dir, f"latents_refined_{used_seed}.pt")
398
- torch.save(latents_cpu, tensor_path)
399
- pixel_tensor = vae_manager_singleton.decode(final_latents, decode_timestep=float(self.config.get("decode_timestep", 0.05)))
400
- video_path = self._save_and_log_video(pixel_tensor, "refined_video", 24.0, temp_dir, results_dir, used_seed)
401
- return video_path, tensor_path
402
 
403
404
 
405
- def encode_mp4(self, latents_path: str, fps: int = 24):
406
- latents = torch.load(latents_path)
407
- seed = random.randint(0, 99999)
408
- temp_dir = tempfile.mkdtemp(prefix="ltxv_enc_"); self._register_tmp_dir(temp_dir)
409
- results_dir = "/app/output"; os.makedirs(results_dir, exist_ok=True)
410
 
411
- # --- LÓGICA DE DIVISÃO SIMPLES COM OVERLAP ---
412
- total_frames = latents.shape[2]
413
- mid_point = max(1, total_frames // 2)
414
- chunk1_latents = latents[:, :, :mid_point, :, :]
415
- chunk2_latents = latents[:, :, mid_point - 1:, :, :]
416
 
417
- video_parts = []
418
- pixel_chunks_to_concat = []
419
- with torch.autocast(device_type="cuda", dtype=self.runtime_autocast_dtype, enabled=self.device == 'cuda'):
420
- for i, chunk in enumerate([chunk1_latents, chunk2_latents]):
421
- if chunk.shape[2] == 0: continue
422
- pixel_chunk = vae_manager_singleton.decode(chunk.to(self.device), decode_timestep=float(self.config.get("decode_timestep", 0.05)))
423
- # Remove o overlap do primeiro chunk de pixels
424
- if i == 0:
425
- pixel_chunks_to_concat.append(pixel_chunk[:, :, :-1, :, :])
426
- else:
427
- pixel_chunks_to_concat.append(pixel_chunk)
428
 
429
- final_pixel_tensor = torch.cat(pixel_chunks_to_concat, dim=2)
430
- final_video_path = self._save_and_log_video(final_pixel_tensor, f"final_concatenated_{seed}", fps, temp_dir, results_dir, seed)
431
- return final_video_path
432
433
 
434
- # --- INSTANCIAÇÃO DO SERVIÇO ---
435
  print("Criando instância do VideoService. O carregamento do modelo começará agora...")
436
  video_generation_service = VideoService()
437
  print("Instância do VideoService pronta para uso.")
 
1
+ # ltx_server_clean_refactor.py — VideoService (Modular Version with Simple Overlap Chunking)
2
 
3
+ # ==============================================================================
4
+ # 0. CONFIGURAÇÃO DE AMBIENTE E IMPORTAÇÕES
5
+ # ==============================================================================
6
+ import os
7
+ import sys
8
+ import gc
9
+ import yaml
10
+ import time
11
+ import json
12
+ import random
13
+ import shutil
14
  import warnings
15
+ import tempfile
16
+ import traceback
17
+ import subprocess
18
+ from pathlib import Path
19
+ from typing import List, Dict, Optional, Tuple, Union
20
+ import cv2
21
+
22
+ # --- Configurações de Logging e Avisos ---
23
  warnings.filterwarnings("ignore", category=UserWarning)
24
  warnings.filterwarnings("ignore", category=FutureWarning)
25
+ from huggingface_hub import logging as hf_logging
26
+ hf_logging.set_verbosity_error()
27
+
28
+ # --- Importações de Bibliotecas de ML/Processamento ---
29
  import torch
30
+ import torch.nn.functional as F
31
  import numpy as np
32
  from PIL import Image
33
  from einops import rearrange
34
+ from huggingface_hub import hf_hub_download
35
+ from safetensors import safe_open
36
+
37
  from managers.vae_manager import vae_manager_singleton
38
  from tools.video_encode_tool import video_encode_tool_singleton
39
+
40
+ from api.aduc_ltx_latent_patch import LTXLatentConditioningPatch, PatchedConditioningItem
41
+
42
+ # --- Constantes Globais ---
43
+ LTXV_DEBUG = True # Mude para False para desativar logs de debug
44
+ LTXV_FRAME_LOG_EVERY = 8
45
  DEPS_DIR = Path("/data")
46
  LTX_VIDEO_REPO_DIR = DEPS_DIR / "LTX-Video"
47
+ RESULTS_DIR = Path("/app/output")
48
+ DEFAULT_FPS = 24.0
49
 
50
+ # ==============================================================================
51
+ # 1. SETUP E FUNÇÕES AUXILIARES DE AMBIENTE
52
+ # ==============================================================================
53
+
54
+ def _run_setup_script():
55
+ """Executa o script setup.py se o repositório LTX-Video não existir."""
56
  setup_script_path = "setup.py"
57
  if not os.path.exists(setup_script_path):
58
  print("[DEBUG] 'setup.py' não encontrado. Pulando clonagem de dependências.")
59
  return
60
+
61
+ print(f"[DEBUG] Repositório não encontrado em {LTX_VIDEO_REPO_DIR}. Executando setup.py...")
62
  try:
63
+ subprocess.run([sys.executable, setup_script_path], check=True, capture_output=True, text=True)
64
+ print("[DEBUG] Script 'setup.py' concluído com sucesso.")
 
65
  except subprocess.CalledProcessError as e:
66
+ print(f"[ERROR] Falha ao executar 'setup.py' (código {e.returncode}).\nOutput:\n{e.stdout}\n{e.stderr}")
67
  sys.exit(1)
68
+
69
+ def add_deps_to_path(repo_path: Path):
70
+ """Adiciona o diretório do repositório ao sys.path para importações locais."""
71
+ resolved_path = str(repo_path.resolve())
72
+ if resolved_path not in sys.path:
73
+ sys.path.insert(0, resolved_path)
74
+ if LTXV_DEBUG:
75
+ print(f"[DEBUG] Adicionado ao sys.path: {resolved_path}")
76
+
77
+ # --- Execução da configuração inicial ---
78
  if not LTX_VIDEO_REPO_DIR.exists():
79
+ _run_setup_script()
80
+ add_deps_to_path(LTX_VIDEO_REPO_DIR)
81
+
82
+ # --- Importações Dependentes do Path Adicionado ---
83
+ from ltx_video.models.autoencoders.vae_encode import un_normalize_latents, normalize_latents
84
+ from ltx_video.pipelines.pipeline_ltx_video import adain_filter_latent
85
+ from ltx_video.models.autoencoders.latent_upsampler import LatentUpsampler
86
+ from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXVideoPipeline
87
+ from transformers import T5EncoderModel, T5Tokenizer, AutoModelForCausalLM, AutoProcessor, AutoTokenizer
88
+ from ltx_video.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
89
+ from ltx_video.models.transformers.symmetric_patchifier import SymmetricPatchifier
90
+ from ltx_video.models.transformers.transformer3d import Transformer3DModel
91
+ from ltx_video.schedulers.rf import RectifiedFlowScheduler
92
+ from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
93
+ import ltx_video.pipelines.crf_compressor as crf_compressor
94
+
95
+
96
+ def create_latent_upsampler(latent_upsampler_model_path: str, device: str):
97
+ latent_upsampler = LatentUpsampler.from_pretrained(latent_upsampler_model_path)
98
+ latent_upsampler.to(device)
99
+ latent_upsampler.eval()
100
+ return latent_upsampler
101
+
102
+ def create_ltx_video_pipeline(
103
+ ckpt_path: str,
104
+ precision: str,
105
+ text_encoder_model_name_or_path: str,
106
+ sampler: Optional[str] = None,
107
+ device: Optional[str] = None,
108
+ enhance_prompt: bool = False,
109
+ prompt_enhancer_image_caption_model_name_or_path: Optional[str] = None,
110
+ prompt_enhancer_llm_model_name_or_path: Optional[str] = None,
111
+ ) -> LTXVideoPipeline:
112
+ ckpt_path = Path(ckpt_path)
113
+ assert os.path.exists(
114
+ ckpt_path
115
+ ), f"Ckpt path provided (--ckpt_path) {ckpt_path} does not exist"
116
+
117
+ with safe_open(ckpt_path, framework="pt") as f:
118
+ metadata = f.metadata()
119
+ config_str = metadata.get("config")
120
+ configs = json.loads(config_str)
121
+ allowed_inference_steps = configs.get("allowed_inference_steps", None)
122
+
123
+ vae = CausalVideoAutoencoder.from_pretrained(ckpt_path)
124
+ transformer = Transformer3DModel.from_pretrained(ckpt_path)
125
+
126
+ # Use constructor if sampler is specified, otherwise use from_pretrained
127
+ if sampler == "from_checkpoint" or not sampler:
128
+ scheduler = RectifiedFlowScheduler.from_pretrained(ckpt_path)
129
+ else:
130
+ scheduler = RectifiedFlowScheduler(
131
+ sampler=("Uniform" if sampler.lower() == "uniform" else "LinearQuadratic")
132
+ )
133
+
134
+ text_encoder = T5EncoderModel.from_pretrained(
135
+ text_encoder_model_name_or_path, subfolder="text_encoder"
136
+ )
137
+ patchifier = SymmetricPatchifier(patch_size=1)
138
+ tokenizer = T5Tokenizer.from_pretrained(
139
+ text_encoder_model_name_or_path, subfolder="tokenizer"
140
+ )
141
+
142
+ transformer = transformer.to(device)
143
+ vae = vae.to(device)
144
+ text_encoder = text_encoder.to(device)
145
+
146
+ if enhance_prompt:
147
+ prompt_enhancer_image_caption_model = AutoModelForCausalLM.from_pretrained(
148
+ prompt_enhancer_image_caption_model_name_or_path, trust_remote_code=True
149
+ )
150
+ prompt_enhancer_image_caption_processor = AutoProcessor.from_pretrained(
151
+ prompt_enhancer_image_caption_model_name_or_path, trust_remote_code=True
152
+ )
153
+ prompt_enhancer_llm_model = AutoModelForCausalLM.from_pretrained(
154
+ prompt_enhancer_llm_model_name_or_path,
155
+ torch_dtype="bfloat16",
156
+ )
157
+ prompt_enhancer_llm_tokenizer = AutoTokenizer.from_pretrained(
158
+ prompt_enhancer_llm_model_name_or_path,
159
+ )
160
+ else:
161
+ prompt_enhancer_image_caption_model = None
162
+ prompt_enhancer_image_caption_processor = None
163
+ prompt_enhancer_llm_model = None
164
+ prompt_enhancer_llm_tokenizer = None
165
+
166
+ vae = vae.to(torch.bfloat16)
167
+ if precision == "bfloat16" and transformer.dtype != torch.bfloat16:
168
+ transformer = transformer.to(torch.bfloat16)
169
+ text_encoder = text_encoder.to(torch.bfloat16)
170
+
171
+ # Use submodels for the pipeline
172
+ submodel_dict = {
173
+ "transformer": transformer,
174
+ "patchifier": patchifier,
175
+ "text_encoder": text_encoder,
176
+ "tokenizer": tokenizer,
177
+ "scheduler": scheduler,
178
+ "vae": vae,
179
+ "prompt_enhancer_image_caption_model": prompt_enhancer_image_caption_model,
180
+ "prompt_enhancer_image_caption_processor": prompt_enhancer_image_caption_processor,
181
+ "prompt_enhancer_llm_model": prompt_enhancer_llm_model,
182
+ "prompt_enhancer_llm_tokenizer": prompt_enhancer_llm_tokenizer,
183
+ "allowed_inference_steps": allowed_inference_steps,
184
+ }
185
+
186
+ pipeline = LTXVideoPipeline(**submodel_dict)
187
+
188
+ LTXLatentConditioningPatch.apply()
189
+
190
+ pipeline = pipeline.to(device)
191
+ return pipeline
192
+
193
+ # ==============================================================================
194
+ # 2. FUNÇÕES AUXILIARES DE PROCESSAMENTO
195
+ # ==============================================================================
196
+
197
+ def calculate_padding(orig_h: int, orig_w: int, target_h: int, target_w: int) -> Tuple[int, int, int, int]:
198
+ """Calcula o preenchimento para centralizar uma imagem em uma nova dimensão."""
199
  pad_h = target_h - orig_h
200
  pad_w = target_w - orig_w
201
  pad_top = pad_h // 2
 
203
  pad_left = pad_w // 2
204
  pad_right = pad_w - pad_left
205
  return (pad_left, pad_right, pad_top, pad_bottom)
206
+
207
+ def log_tensor_info(tensor: torch.Tensor, name: str = "Tensor"):
208
+ """Exibe informações detalhadas sobre um tensor para depuração."""
209
  if not isinstance(tensor, torch.Tensor):
210
+ print(f"\n[INFO] '{name}' não é um tensor.")
211
  return
212
+ print(f"\n--- Tensor Info: {name} ---")
213
+ print(f" - Shape: {tuple(tensor.shape)}")
214
+ print(f" - Dtype: {tensor.dtype}")
215
  print(f" - Device: {tensor.device}")
216
  if tensor.numel() > 0:
217
  try:
218
+ print(f" - Stats: Min={tensor.min().item():.4f}, Max={tensor.max().item():.4f}, Mean={tensor.mean().item():.4f}")
219
+ except RuntimeError:
220
+ print(" - Stats: Não foi possível calcular (ex: tensores bool).")
221
+ print("-" * 30)
222
 
223
+ # ==============================================================================
224
+ # 3. CLASSE PRINCIPAL DO SERVIÇO DE VÍDEO
225
+ # ==============================================================================
226
 
227
  class VideoService:
228
+ """
229
+ Serviço encapsulado para gerar vídeos usando a pipeline LTX-Video.
230
+ Gerencia o carregamento de modelos, pré-processamento, geração em múltiplos
231
+ passos (baixa resolução, upscale com denoise) e pós-processamento.
232
+ """
233
  def __init__(self):
234
+ """Inicializa o serviço, carregando configurações e modelos."""
235
  t0 = time.perf_counter()
236
+ print("[INFO] Inicializando VideoService...")
237
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
238
+ self.config = self._load_config("ltxv-13b-0.9.8-distilled-fp8.yaml")
239
+
240
+ self.pipeline, self.latent_upsampler = self._load_models_from_hub()
241
+ self._move_models_to_device()
242
+
243
+ self.runtime_autocast_dtype = self._get_precision_dtype()
244
  vae_manager_singleton.attach_pipeline(
245
  self.pipeline,
246
  device=self.device,
247
  autocast_dtype=self.runtime_autocast_dtype
248
  )
249
  self._tmp_dirs = set()
250
+ RESULTS_DIR.mkdir(exist_ok=True)
251
+ print(f"[INFO] VideoService pronto. Tempo de inicialização: {time.perf_counter()-t0:.2f}s")
252
 
253
+ # --------------------------------------------------------------------------
254
+ # --- Métodos Públicos (API do Serviço) ---
255
+ # --------------------------------------------------------------------------
 
 
256
 
257
+ def _load_image_to_tensor_with_resize_and_crop(
258
+ self,
259
+ image_input: Union[str, Image.Image],
260
+ target_height: int = 512,
261
+ target_width: int = 768,
262
+ just_crop: bool = False,
263
+ ) -> torch.Tensor:
264
+ """Load and process an image into a tensor.
265
+
266
+ Args:
267
+ image_input: Either a file path (str) or a PIL Image object
268
+ target_height: Desired height of output tensor
269
+ target_width: Desired width of output tensor
270
+ just_crop: If True, only crop the image to the target size without resizing
271
+ """
272
+ if isinstance(image_input, str):
273
+ image = Image.open(image_input).convert("RGB")
274
+ elif isinstance(image_input, Image.Image):
275
+ image = image_input
276
+ else:
277
+ raise ValueError("image_input must be either a file path or a PIL Image object")
278
+
279
+ input_width, input_height = image.size
280
+ aspect_ratio_target = target_width / target_height
281
+ aspect_ratio_frame = input_width / input_height
282
+ if aspect_ratio_frame > aspect_ratio_target:
283
+ new_width = int(input_height * aspect_ratio_target)
284
+ new_height = input_height
285
+ x_start = (input_width - new_width) // 2
286
+ y_start = 0
287
+ else:
288
+ new_width = input_width
289
+ new_height = int(input_width / aspect_ratio_target)
290
+ x_start = 0
291
+ y_start = (input_height - new_height) // 2
292
+
293
+ image = image.crop((x_start, y_start, x_start + new_width, y_start + new_height))
294
+ if not just_crop:
295
+ image = image.resize((target_width, target_height))
296
+
297
+ image = np.array(image)
298
+ image = cv2.GaussianBlur(image, (3, 3), 0)
299
+ frame_tensor = torch.from_numpy(image).float()
300
+ frame_tensor = crf_compressor.compress(frame_tensor / 255.0) * 255.0
301
+ frame_tensor = frame_tensor.permute(2, 0, 1)
302
+ frame_tensor = (frame_tensor / 127.5) - 1.0
303
+ # Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
304
+ return frame_tensor.unsqueeze(0).unsqueeze(2)
305
+
306
+
307
+
308
+ def generate_low_resolution1(self, prompt: str, negative_prompt: str, height: int, width: int, duration_secs: float, guidance_scale: float, seed: Optional[int] = None, conditioning_items: Optional[List[PatchedConditioningItem]] = None) -> Tuple[str, str, int]:
309
+ """
310
+ Gera um vídeo de baixa resolução e retorna os caminhos para o vídeo e os latentes.
311
+ """
312
+ used_seed = random.randint(0, 2**32 - 1) if seed is None else int(seed)
313
+ #self._seed_everething(used_seed)
314
+
315
+ actual_num_frames = max(9, int(round((round(duration_secs * DEFAULT_FPS) - 1) / 8.0) * 8 + 1))
316
+
317
+ downscaled_height, downscaled_width = self._calculate_downscaled_dims(height, width)
318
+
319
+ first_pass_kwargs = {
320
+ "prompt": prompt, "negative_prompt": negative_prompt, "height": downscaled_height,
321
+ "width": downscaled_width, "num_frames": actual_num_frames, "frame_rate": int(DEFAULT_FPS),
322
+ "generator": torch.Generator(device=self.device).manual_seed(used_seed),
323
+ "output_type": "latent", "conditioning_items": conditioning_items,
324
+ "guidance_scale": float(guidance_scale), **(self.config.get("first_pass", {}))
325
+ }
326
+
327
+ temp_dir = tempfile.mkdtemp(prefix="ltxv_low_")
328
+ self._register_tmp_dir(temp_dir)
329
+
330
  try:
331
+ with torch.autocast(device_type=self.device.split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.device == 'cuda')):
332
+ latents = self.pipeline(**first_pass_kwargs).images
333
+ #pixel_tensor = vae_manager_singleton.decode(latents.clone(), decode_timestep=float(self.config.get("decode_timestep", 0.05)))
334
+ #video_path = self._save_video_from_tensor(pixel_tensor, "low_res_video", used_seed, temp_dir)
335
+ latents_path = self._save_latents_to_disk(latents, "latents_low_res", used_seed)
336
+
337
+ log_tensor_info(latents, "first_pass_lat" )
338
+ self._finalize()
339
+
340
+ final_video_path, final_latents_path = self.generate_upscale_denoise(
341
+ latents_path=latents_path,
342
+ prompt=prompt,
343
+ negative_prompt=negative_prompt,
344
+ guidance_scale=guidance_scale,
345
+ seed=used_seed
346
+ )
347
+
348
+ print(f"[SUCCESS] PASSO 2 concluído. Vídeo final em: {final_video_path}")
349
+
350
+ return final_video_path, final_latents_path, used_seed
351
+
352
+ except Exception as e:
353
+ print(f"[ERROR] Falha na geração de baixa resolução: {e}")
354
+ traceback.print_exc()
355
+ raise
356
+ finally:
357
+ self._finalize()
358
+
359
+ # Em api/ltx_server_refactored.py, dentro da classe VideoService
360
+
361
+
362
+ # ADICIONE A FUNÇÃO ABAIXO
363
+ @torch.no_grad()
364
+ def _image_to_latents(self, image_input: Union[str, Image.Image], height: int, width: int) -> torch.Tensor:
365
+ """
366
+ Converte uma imagem (caminho ou PIL) em um tensor de latentes 5D.
367
+ Retorna: Tensor na forma [1, C_lat, 1, H_lat, W_lat]
368
+ """
369
+ print(f"[DEBUG] Codificando imagem para latente ({height}x{width})...")
370
+ # 1. Carrega a imagem e a transforma em um tensor de pixel 5D
371
+ pixel_tensor = self._load_image_to_tensor_with_resize_and_crop(
372
+ image_input, target_height=height, target_width=width
373
+ )
374
+ pixel_tensor_gpu = pixel_tensor.to(self.device, dtype=self.pipeline.vae.dtype)
375
+
376
+ # 2. Usa a VAE para codificar o tensor de pixel em um tensor de latentes
377
+ with torch.autocast(device_type=self.device.split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.device == 'cuda')):
378
+ # O vae_encode da pipeline já lida com tensores 5D
379
+ latents = self.pipeline.vae.encode(pixel_tensor_gpu).latent_dist.sample()
380
+
381
+ # 3. Aplica o fator de escala (importante para consistência)
382
+ if hasattr(self.pipeline.vae.config, "scaling_factor"):
383
+ latents = latents * self.pipeline.vae.config.scaling_factor
384
+
385
+ print(f"[DEBUG] Imagem codificada para latente com shape: {latents.shape}")
386
+ return latents
387
+
388
+ def _prepare_condition_items(self, items_list: List[Tuple], height: int, width: int) -> List[PatchedConditioningItem]:
389
+ """
390
+ Prepara os itens de condicionamento.
391
+ Recebe uma lista [Imagem, frame, peso], converte a Imagem para LATENTE
392
+ e cria uma lista de PatchedConditioningItem com o tensor em `latents`.
393
+ """
394
+ if not items_list:
395
+ return []
396
+
397
+ conditioning_items = []
398
+ for media_input, frame_idx, weight in items_list:
399
+ # 1. USA A NOVA FUNÇÃO PARA OBTER O TENSOR DE LATENTES DIRETAMENTE
400
+ latent_tensor = self._image_to_latents(media_input, height, width)
401
+
402
+ safe_frame_idx = int(frame_idx)
403
+
404
+ # 2. CRIA O PatchedConditioningItem COM O CAMPO `latents` PREENCHIDO
405
+ item = PatchedConditioningItem(
406
+ media_frame_number=safe_frame_idx,
407
+ conditioning_strength=float(weight),
408
+ media_item=None, # Importante: media_item é None
409
+ latents=latent_tensor # O latente pré-calculado vai aqui!
410
+ )
411
+ conditioning_items.append(item)
412
+
413
+ print(f"[INFO] Preparados {len(conditioning_items)} itens de condicionamento com latentes pré-codificados.")
414
+ return conditioning_items
415
+
416
+ def generate_upscale_denoise(self, latents_path: str, prompt: str, negative_prompt: str, guidance_scale: float, seed: Optional[int] = None) -> Tuple[str, str]:
417
+ """
418
+ Aplica upscale, AdaIN e Denoise em latentes de baixa resolução usando um processo de chunking.
419
+ """
420
+ used_seed = random.randint(0, 2**32 - 1) if seed is None else int(seed)
421
+ #seed_everything(used_seed)
422
+
423
+ temp_dir = tempfile.mkdtemp(prefix="ltxv_up_")
424
+ self._register_tmp_dir(temp_dir)
425
+
426
+ try:
427
+ latents_low = torch.load(latents_path).to(self.device)
428
+ with torch.autocast(device_type=self.device.split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.device == 'cuda')):
429
+ upsampled_latents = self._upsample_and_filter_latents(latents_low)
430
+ del latents_low; torch.cuda.empty_cache()
431
+
432
+ chunks = self._split_latents_with_overlap(upsampled_latents)
433
+ refined_chunks = []
434
+
435
+ for chunk in chunks:
436
+ if chunk.shape[2] <= 1: continue # Pula chunks inválidos
437
+
438
+ second_pass_height = chunk.shape[3] * self.pipeline.vae_scale_factor
439
+ second_pass_width = chunk.shape[4] * self.pipeline.vae_scale_factor
440
+
441
+ second_pass_kwargs = {
442
+ "prompt": prompt, "negative_prompt": negative_prompt, "height": second_pass_height,
443
+ "width": second_pass_width, "num_frames": chunk.shape[2], "latents": chunk,
444
+ "guidance_scale": float(guidance_scale), "output_type": "latent",
445
+ "generator": torch.Generator(device=self.device).manual_seed(used_seed),
446
+ **(self.config.get("second_pass", {}))
447
+ }
448
+ refined_chunk = self.pipeline(**second_pass_kwargs).images
449
+ refined_chunks.append(refined_chunk)
450
+
451
+ log_tensor_info(refined_chunk, "refined_chunk" )
452
+
453
+ final_latents = self._merge_chunks_with_overlap(refined_chunks)
454
+
455
+ if LTXV_DEBUG:
456
+ log_tensor_info(final_latents, "Latentes Upscaled/Refinados Finais")
457
+
458
+ latents_path = self._save_latents_to_disk(final_latents, "latents_refined", used_seed)
459
+ pixel_tensor = vae_manager_singleton.decode(final_latents, decode_timestep=float(self.config.get("decode_timestep", 0.05)))
460
+ video_path = self._save_video_from_tensor(pixel_tensor, "refined_video", used_seed, temp_dir)
461
+
462
+ return video_path, latents_path
463
+
464
  except Exception as e:
465
+ print(f"[ERROR] Falha no processo de upscale e denoise: {e}")
466
+ traceback.print_exc()
467
+ raise
468
+ finally:
469
+ self._finalize()
470
+
471
+ def generate_low_resolution(
472
+ self,
473
+ prompt: str,
474
+ negative_prompt: str,
475
+ height: int,
476
+ width: int,
477
+ duration_secs: float,
478
+ guidance_scale: float,
479
+ seed: Optional[int] = None,
480
+ conditioning_items: Optional[List[PatchedConditioningItem]] = None
481
+ ) -> Tuple[str, str, int]:
482
+ """
483
+ ETAPA 1: Gera um vídeo e latentes em resolução base a partir de um prompt e
484
+ condicionamentos opcionais.
485
+ """
486
+ print("[INFO] Iniciando ETAPA 1: Geração de Baixa Resolução...")
487
+
488
+ # --- Configuração de Seed e Diretórios ---
489
+ used_seed = random.randint(0, 2**32 - 1) if seed is None else int(seed)
490
+ #seed_everything(used_seed)
491
+ print(f" - Usando Seed: {used_seed}")
492
+
493
+ temp_dir = tempfile.mkdtemp(prefix="ltxv_low_")
494
+ self._register_tmp_dir(temp_dir)
495
+ results_dir = "/app/output"
496
+ os.makedirs(results_dir, exist_ok=True)
497
+
498
+ # --- Cálculo de Dimensões e Frames ---
499
+ actual_num_frames = int(round(duration_secs * DEFAULT_FPS))
500
+ downscaled_height = height
501
+ downscaled_width = width
502
+ #self._calculate_downscaled_dims(height, width)
503
+
504
+
505
+ print(f" - Frames: {actual_num_frames}, Duração: {duration_secs}s")
506
+ print(f" - Dimensões de Saída: {downscaled_height}x{downscaled_width}")
507
+
508
+ # --- Execução da Pipeline ---
509
+ with torch.autocast(device_type=self.device.split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.device == 'cuda')):
510
+
511
+ first_pass_kwargs = {
512
+ "prompt": prompt,
513
+ "negative_prompt": negative_prompt,
514
+ "height": downscaled_height,
515
+ "width": downscaled_width,
516
+ "num_frames": (actual_num_frames//8)+1,
517
+ "frame_rate": int(DEFAULT_FPS),
518
+ "generator": torch.Generator(device=self.device).manual_seed(used_seed),
519
+ "output_type": "latent",
520
+ "conditioning_items": conditioning_items,
521
+ "guidance_scale": float(guidance_scale),
522
+ **(self.config.get("first_pass", {}))
523
+ }
524
+
525
+ print(" - Enviando para a pipeline LTX...")
526
+ latents = self.pipeline(**first_pass_kwargs).images
527
+ print(f" - Latentes gerados com shape: {latents.shape}")
528
+
529
+ # Decodifica os latentes para pixels para criar o vídeo de preview
530
+ #pixel_tensor = vae_manager_singleton.decode(latents.clone(), decode_timestep=float(self.config.get("decode_timestep", 0.05)))
531
+
532
+ # Salva os artefatos de saída (vídeo e tensor de latentes)
533
+ #video_path = self._save_video_from_tensor(pixel_tensor, "low_res_video", used_seed, temp_dir)
534
+ tensor_path = self._save_latents_to_disk(latents, "latents_low_res", used_seed)
535
+
536
+ self._finalize()
537
+
538
+ final_video_path, final_latents_path = self.refine_texture_only(
539
+ latents_path=tensor_path,
540
+ prompt=prompt,
541
+ negative_prompt=negative_prompt,
542
+ guidance_scale=guidance_scale,
543
+ seed=used_seed,
544
+ conditioning_items=conditioning_items,
545
+ )
546
+
547
+ # --- Limpeza ---
548
+ self._finalize()
549
+
550
+ print("[SUCCESS] ETAPA 1 Concluída.")
551
+ return final_video_path, final_latents_path, used_seed
552
+
553
+
554
+ def refine_texture_only(
555
+ self,
556
+ latents_path: str,
557
+ prompt: str,
558
+ negative_prompt: str,
559
+ guidance_scale: float,
560
+ seed: Optional[int] = None,
561
+ conditioning_items: Optional[List[PatchedConditioningItem]] = None
562
+ ) -> Tuple[str, str]:
563
+ """
564
+ ETAPA 2: Refina a textura dos latentes existentes SEM alterar sua resolução
565
+ e SEM dividi-los em pedaços. O tensor inteiro é processado de uma só vez para
566
+ garantir máxima consistência temporal.
567
+ """
568
+ print("[INFO] Iniciando ETAPA 2: Refinamento de Textura...")
569
+
570
+ # --- Configuração de Seed e Diretórios ---
571
+ used_seed = random.randint(0, 2**32 - 1) if seed is None else int(seed)
572
+ #seed_everything(used_seed)
573
+ print(f" - Usando Seed (consistente com Etapa 1): {used_seed}")
574
+
575
+ temp_dir = tempfile.mkdtemp(prefix="ltxv_refine_single_")
576
+ self._register_tmp_dir(temp_dir)
577
+
578
+ # --- Carregamento dos Latentes ---
579
+ latents_to_refine = torch.load(latents_path).to(self.device)
580
+ print(f" - Shape dos latentes de entrada: {latents_to_refine.shape}")
581
+
582
+ if conditioning_items:
583
+ print(f" - Usando {len(conditioning_items)} item(ns) de condicionamento para o refinamento.")
584
+
585
+ # --- Execução da Pipeline ---
586
+ with torch.autocast(device_type=self.device.split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.device == 'cuda')):
587
+
588
+ # As dimensões são as mesmas do tensor de entrada
589
+ refine_height = latents_to_refine.shape[3] * self.pipeline.vae_scale_factor
590
+ refine_width = latents_to_refine.shape[4] * self.pipeline.vae_scale_factor
591
+
592
+ second_pass_kwargs = {
593
+ "prompt": prompt,
594
+ "negative_prompt": negative_prompt,
595
+ "height": refine_height,
596
+ "width": refine_width,
597
+ "frame_rate": int(DEFAULT_FPS),
598
+ "num_frames": latents_to_refine.shape[2],
599
+ "latents": latents_to_refine, # O tensor completo é passado aqui
600
+ "guidance_scale": float(guidance_scale),
601
+ "output_type": "latent",
602
+ "generator": torch.Generator(device=self.device).manual_seed(used_seed),
603
+ "conditioning_items": conditioning_items,
604
+ **(self.config.get("second_pass", {}))
605
+ }
606
+
607
+ print(" - Enviando tensor completo para a pipeline de refinamento...")
608
+ final_latents = self.pipeline(**second_pass_kwargs).images
609
+ print(f" - Latentes refinados com shape: {final_latents.shape}")
610
+
611
+ # Decodifica os latentes refinados para pixels
612
+ pixel_tensor = vae_manager_singleton.decode(final_latents, decode_timestep=float(self.config.get("decode_timestep", 0.05)))
613
+
614
+ # Salva os artefatos de saída
615
+ video_path_out = self._save_video_from_tensor(pixel_tensor, "refined_video_single_pass", used_seed, temp_dir)
616
+ latents_path_out = self._save_latents_to_disk(final_latents, "latents_refined_single_pass", used_seed)
617
+
618
+ # --- Limpeza ---
619
+ # Libera os tensores da memória da GPU antes de finalizar.
620
+ del latents_to_refine
621
+ if 'final_latents' in locals():
622
+ del final_latents
623
+ if 'pixel_tensor' in locals():
624
+ del pixel_tensor
625
+ self._finalize()
626
+
627
+ print("[SUCCESS] ETAPA 2 Concluída.")
628
+ return video_path_out, latents_path_out
629
+
630
+
631
+ def encode_latents_to_mp4(self, latents_path: str, fps: int = int(DEFAULT_FPS)) -> str:
632
+ """Decodifica um tensor de latentes salvo e o salva como um vídeo MP4."""
633
+ latents = torch.load(latents_path)
634
+ temp_dir = tempfile.mkdtemp(prefix="ltxv_enc_")
635
+ self._register_tmp_dir(temp_dir)
636
+ seed = random.randint(0, 99999) # Seed apenas para nome do arquivo
637
+
638
  try:
639
+ chunks = self._split_latents_with_overlap(latents)
640
+ pixel_chunks = []
641
+
642
+ with torch.autocast(device_type=self.device.split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.device == 'cuda')):
643
+ for chunk in chunks:
644
+ if chunk.shape[2] == 0: continue
645
+ pixel_chunk = vae_manager_singleton.decode(chunk.to(self.device), decode_timestep=float(self.config.get("decode_timestep", 0.05)))
646
+ pixel_chunks.append(pixel_chunk)
647
+
648
+ final_pixel_tensor = self._merge_chunks_with_overlap(pixel_chunks)
649
+ final_video_path = self._save_video_from_tensor(final_pixel_tensor, f"final_video_{seed}", seed, temp_dir, fps=fps)
650
+ return final_video_path
651
+
652
  except Exception as e:
653
+ print(f"[ERROR] Falha ao encodar latentes para MP4: {e}")
654
+ traceback.print_exc()
655
+ raise
656
+ finally:
657
+ self._finalize()
658
 
659
+ # --------------------------------------------------------------------------
660
+ # --- Métodos Internos e Auxiliares ---
661
+ # --------------------------------------------------------------------------
662
+
663
+ def _finalize(self):
664
+ """Limpa a memória da GPU e os diretórios temporários."""
665
+ if LTXV_DEBUG:
666
+ print("[DEBUG] Finalize: iniciando limpeza...")
667
+
668
+ gc.collect()
669
+ if torch.cuda.is_available():
670
+ torch.cuda.empty_cache()
671
+ torch.cuda.ipc_collect()
672
+
673
+ # Limpa todos os diretórios temporários registrados
674
+ for d in list(self._tmp_dirs):
675
+ shutil.rmtree(d, ignore_errors=True)
676
+ self._tmp_dirs.remove(d)
677
+ if LTXV_DEBUG:
678
+ print(f"[DEBUG] Diretório temporário removido: {d}")
679
+
680
+ def _load_config(self, config_filename: str) -> Dict:
681
+ """Carrega o arquivo de configuração YAML."""
682
+ config_path = LTX_VIDEO_REPO_DIR / "configs" / config_filename
683
+ print(f"[INFO] Carregando configuração de: {config_path}")
684
+ with open(config_path, "r") as file:
685
+ return yaml.safe_load(file)
686
+
687
+ def _load_models_from_hub(self):
688
+ """Baixa e cria as instâncias da pipeline e do upsampler."""
689
  t0 = time.perf_counter()
690
  LTX_REPO = "Lightricks/LTX-Video"
691
+
692
+ print("[INFO] Baixando checkpoint principal...")
693
+ self.config["checkpoint_path"] = hf_hub_download(
694
+ repo_id=LTX_REPO, filename=self.config["checkpoint_path"],
695
  token=os.getenv("HF_TOKEN")
696
  )
697
+ print(f"[INFO] Checkpoint principal em: {self.config['checkpoint_path']}")
 
698
 
699
+ print("[INFO] Construindo pipeline...")
700
  pipeline = create_ltx_video_pipeline(
701
  ckpt_path=self.config["checkpoint_path"],
702
  precision=self.config["precision"],
703
  text_encoder_model_name_or_path=self.config["text_encoder_model_name_or_path"],
704
  sampler=self.config["sampler"],
705
+ device="cpu", # Carrega em CPU primeiro
706
+ enhance_prompt=False
 
 
707
  )
708
+ print("[INFO] Pipeline construída.")
709
 
710
  latent_upsampler = None
711
  if self.config.get("spatial_upscaler_model_path"):
712
+ print("[INFO] Baixando upscaler espacial...")
713
+ self.config["spatial_upscaler_model_path"] = hf_hub_download(
714
+ repo_id=LTX_REPO, filename=self.config["spatial_upscaler_model_path"],
715
+ token=os.getenv("HF_TOKEN")
716
+ )
717
+ print(f"[INFO] Upscaler em: {self.config['spatial_upscaler_model_path']}")
718
+
719
+ print("[INFO] Construindo latent_upsampler...")
720
  latent_upsampler = create_latent_upsampler(self.config["spatial_upscaler_model_path"], device="cpu")
721
+ print("[INFO] Latent upsampler construído.")
722
+
723
+ print(f"[INFO] Carregamento de modelos concluído em {time.perf_counter()-t0:.2f}s")
724
  return pipeline, latent_upsampler
725
+
726
+ def _move_models_to_device(self):
727
+ """Move os modelos carregados para o dispositivo de computação (GPU/CPU)."""
728
+ print(f"[INFO] Movendo modelos para o dispositivo: {self.device}")
729
+ self.pipeline.to(self.device)
730
+ if self.latent_upsampler:
731
+ self.latent_upsampler.to(self.device)
732
 
733
+ def _get_precision_dtype(self) -> torch.dtype:
734
+ """Determina o dtype para autocast com base na configuração de precisão."""
735
  prec = str(self.config.get("precision", "")).lower()
 
736
  if prec in ["float8_e4m3fn", "bfloat16"]:
737
+ return torch.bfloat16
738
  elif prec == "mixed_precision":
739
+ return torch.float16
740
+ return torch.float32
741
 
742
  @torch.no_grad()
743
+ def _upsample_and_filter_latents(self, latents: torch.Tensor) -> torch.Tensor:
744
+ """Aplica o upsample espacial e o filtro AdaIN aos latentes."""
745
+ if not self.latent_upsampler:
746
+ raise ValueError("Latent Upsampler não está carregado para a operação de upscale.")
747
 
748
+ latents_unnormalized = un_normalize_latents(latents, self.pipeline.vae, vae_per_channel_normalize=True)
749
+ upsampled_latents_unnormalized = self.latent_upsampler(latents_unnormalized)
750
+ upsampled_latents_normalized = normalize_latents(upsampled_latents_unnormalized, self.pipeline.vae, vae_per_channel_normalize=True)
751
 
752
+ # Filtro AdaIN para manter consistência de cor/estilo com o vídeo de baixa resolução
753
+ return adain_filter_latent(latents=upsampled_latents_normalized, reference_latents=latents)
754
+
755
  def _prepare_conditioning_tensor_from_path(self, filepath: str, height: int, width: int, padding: Tuple) -> torch.Tensor:
756
  """Carrega uma imagem, redimensiona, aplica padding e move para o dispositivo."""
757
+ tensor = self._load_image_to_tensor_with_resize_and_crop(filepath, height, width)
758
  tensor = F.pad(tensor, padding)
759
  return tensor.to(self.device, dtype=self.runtime_autocast_dtype)
 
760
 
761
+ def _calculate_downscaled_dims(self, height: int, width: int) -> Tuple[int, int]:
762
+ """Calcula as dimensões para o primeiro passo (baixa resolução)."""
763
  height_padded = ((height - 1) // 8 + 1) * 8
764
  width_padded = ((width - 1) // 8 + 1) * 8
765
+
 
766
  downscale_factor = self.config.get("downscale_factor", 0.6666666)
767
  vae_scale_factor = self.pipeline.vae_scale_factor
768
 
769
+ target_w = int(width_padded * downscale_factor)
770
+ downscaled_width = target_w - (target_w % vae_scale_factor)
771
+
772
+ target_h = int(height_padded * downscale_factor)
773
+ downscaled_height = target_h - (target_h % vae_scale_factor)
774
+
775
+ return downscaled_height, downscaled_width
776
 
777
+ def _split_latents_with_overlap(self, latents: torch.Tensor, overlap: int = 1) -> List[torch.Tensor]:
778
+ """Divide um tensor de latentes em dois chunks com sobreposição."""
779
+ total_frames = latents.shape[2]
780
+ if total_frames <= overlap:
781
+ return [latents]
782
 
783
+ mid_point = max(overlap, total_frames // 2)
784
+ chunk1 = latents[:, :, :mid_point, :, :]
785
+ # O segundo chunk começa 'overlap' frames antes para criar a sobreposição
786
+ chunk2 = latents[:, :, mid_point - overlap:, :, :]
787
+
788
+ return [c for c in [chunk1, chunk2] if c.shape[2] > 0]
789
 
790
+ def _merge_chunks_with_overlap(self, chunks: List[torch.Tensor], overlap: int = 1) -> torch.Tensor:
791
+ """Junta uma lista de chunks, removendo a sobreposição."""
792
+ if not chunks:
793
+ return torch.empty(0)
794
+ if len(chunks) == 1:
795
+ return chunks[0]
796
+
797
+ # Pega o primeiro chunk sem o frame de sobreposição final
798
+ merged_list = [chunks[0][:, :, :-overlap, :, :]]
799
+ # Adiciona os chunks restantes
800
+ merged_list.extend(chunks[1:])
801
 
802
+ return torch.cat(merged_list, dim=2)
803
+
804
+ def _save_latents_to_disk(self, latents_tensor: torch.Tensor, base_filename: str, seed: int) -> str:
805
+ """Salva um tensor de latentes em um arquivo .pt."""
806
+ latents_cpu = latents_tensor.detach().to("cpu")
807
+ tensor_path = RESULTS_DIR / f"{base_filename}_{seed}.pt"
808
+ torch.save(latents_cpu, tensor_path)
809
+ if LTXV_DEBUG:
810
+ print(f"[DEBUG] Latentes salvos em: {tensor_path}")
811
+ return str(tensor_path)
812
+
813
+ def _save_video_from_tensor(self, pixel_tensor: torch.Tensor, base_filename: str, seed: int, temp_dir: str, fps: int = int(DEFAULT_FPS)) -> str:
814
+ """Salva um tensor de pixels como um arquivo de vídeo MP4."""
815
+ temp_path = os.path.join(temp_dir, f"{base_filename}_{seed}.mp4")
816
+ video_encode_tool_singleton.save_video_from_tensor(pixel_tensor, temp_path, fps=fps)
817
 
818
+ final_path = RESULTS_DIR / f"{base_filename}_{seed}.mp4"
819
+ shutil.move(temp_path, final_path)
820
+ print(f"[INFO] Vídeo final salvo em: {final_path}")
821
+ return str(final_path)
822
 
823
+
824
+ def _seed_everething(self, seed: int):
825
+ random.seed(seed)
826
+ np.random.seed(seed)
827
+ torch.manual_seed(seed)
828
+ if torch.cuda.is_available():
829
+ torch.cuda.manual_seed(seed)
830
+ if torch.backends.mps.is_available():
831
+ torch.mps.manual_seed(seed)
832
 
833
+
834
+ def _register_tmp_dir(self, dir_path: str):
835
+ """Registra um diretório temporário para limpeza posterior."""
836
+ if dir_path and os.path.isdir(dir_path):
837
+ self._tmp_dirs.add(dir_path)
838
+ if LTXV_DEBUG:
839
+ print(f"[DEBUG] Diretório temporário registrado: {dir_path}")
840
+
841
+ # ==============================================================================
842
+ # 4. INSTANCIAÇÃO E PONTO DE ENTRADA (Exemplo)
843
+ # ==============================================================================
844
 
 
845
  print("Criando instância do VideoService. O carregamento do modelo começará agora...")
846
  video_generation_service = VideoService()
847
  print("Instância do VideoService pronta para uso.")