euiiiia committed on
Commit cb6fb4c · verified · 1 Parent(s): a9beee3

Update api/ltx_server_refactored.py

Files changed (1):
  1. api/ltx_server_refactored.py +199 -58
api/ltx_server_refactored.py CHANGED
@@ -19,7 +19,6 @@ import subprocess
 from pathlib import Path
 from typing import List, Dict, Optional, Tuple, Union
 
-
 # --- Logging and Warnings Configuration ---
 warnings.filterwarnings("ignore", category=UserWarning)
 warnings.filterwarnings("ignore", category=FutureWarning)
@@ -92,12 +91,55 @@ from ltx_video.schedulers.rf import RectifiedFlowScheduler
 from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
 import ltx_video.pipelines.crf_compressor as crf_compressor
 
-from ltx_video.models.autoencoders.vae_encode import (
-    get_vae_size_scale_factor,
-    latent_to_pixel_coords,
-    vae_decode,
-    vae_encode,
-)
+
+
+def load_image_to_tensor_with_resize_and_crop(
+    image_input: Union[str, Image.Image],
+    target_height: int = 512,
+    target_width: int = 768,
+    just_crop: bool = False,
+) -> torch.Tensor:
+    """Load and process an image into a tensor.
+
+    Args:
+        image_input: Either a file path (str) or a PIL Image object
+        target_height: Desired height of output tensor
+        target_width: Desired width of output tensor
+        just_crop: If True, only crop the image to the target size without resizing
+    """
+    if isinstance(image_input, str):
+        image = Image.open(image_input).convert("RGB")
+    elif isinstance(image_input, Image.Image):
+        image = image_input
+    else:
+        raise ValueError("image_input must be either a file path or a PIL Image object")
+
+    input_width, input_height = image.size
+    aspect_ratio_target = target_width / target_height
+    aspect_ratio_frame = input_width / input_height
+    if aspect_ratio_frame > aspect_ratio_target:
+        new_width = int(input_height * aspect_ratio_target)
+        new_height = input_height
+        x_start = (input_width - new_width) // 2
+        y_start = 0
+    else:
+        new_width = input_width
+        new_height = int(input_width / aspect_ratio_target)
+        x_start = 0
+        y_start = (input_height - new_height) // 2
+
+    image = image.crop((x_start, y_start, x_start + new_width, y_start + new_height))
+    if not just_crop:
+        image = image.resize((target_width, target_height))
+
+    image = np.array(image)
+    image = cv2.GaussianBlur(image, (3, 3), 0)
+    frame_tensor = torch.from_numpy(image).float()
+    frame_tensor = crf_compressor.compress(frame_tensor / 255.0) * 255.0
+    frame_tensor = frame_tensor.permute(2, 0, 1)
+    frame_tensor = (frame_tensor / 127.5) - 1.0
+    # Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
+    return frame_tensor.unsqueeze(0).unsqueeze(2)
 
 def create_latent_upsampler(latent_upsampler_model_path: str, device: str):
     latent_upsampler = LatentUpsampler.from_pretrained(latent_upsampler_model_path)
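
A quick sketch of the new helper's contract (the image path below is hypothetical; the sizes are the defaults):

# Returns a (1, 3, 1, H, W) tensor scaled to [-1, 1], center-cropped to the
# target aspect ratio before resizing.
frame = load_image_to_tensor_with_resize_and_crop(
    "inputs/first_frame.png",  # hypothetical path
    target_height=512,
    target_width=768,
)
assert frame.shape == (1, 3, 1, 512, 768)
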
@@ -174,22 +216,6 @@ def create_ltx_video_pipeline(
     transformer = transformer.to(torch.bfloat16)
     text_encoder = text_encoder.to(torch.bfloat16)
 
-
-    # --- Coherent global precision adjustment ---
-    if precision in ["float8_e4m3fn", "bfloat16"]:
-        dtype_target = torch.bfloat16
-    elif precision == "mixed_precision":
-        dtype_target = torch.float16
-    else:
-        dtype_target = torch.float32
-
-    for m in [vae, transformer, text_encoder]:
-        m.to(dtype_target)
-
-    # keeps the overall pipeline dtype-consistent
-    pipeline_dtype = dtype_target
-
-
     # Use submodels for the pipeline
     submodel_dict = {
         "transformer": transformer,
@@ -206,14 +232,38 @@ def create_ltx_video_pipeline(
     }
 
     pipeline = LTXVideoPipeline(**submodel_dict)
-
     pipeline = pipeline.to(device)
-    pipeline.to(dtype=pipeline_dtype)
-
-
-
     return pipeline
 
+# ==============================================================================
+# 2. AUXILIARY PROCESSING FUNCTIONS
+# ==============================================================================
+
+def calculate_padding(orig_h: int, orig_w: int, target_h: int, target_w: int) -> Tuple[int, int, int, int]:
+    """Computes the padding needed to center an image within new dimensions."""
+    pad_h = target_h - orig_h
+    pad_w = target_w - orig_w
+    pad_top = pad_h // 2
+    pad_bottom = pad_h - pad_top
+    pad_left = pad_w // 2
+    pad_right = pad_w - pad_left
+    return (pad_left, pad_right, pad_top, pad_bottom)
+
+def log_tensor_info(tensor: torch.Tensor, name: str = "Tensor"):
+    """Prints detailed information about a tensor for debugging."""
+    if not isinstance(tensor, torch.Tensor):
+        print(f"\n[INFO] '{name}' is not a tensor.")
+        return
+    print(f"\n--- Tensor Info: {name} ---")
+    print(f"  - Shape: {tuple(tensor.shape)}")
+    print(f"  - Dtype: {tensor.dtype}")
+    print(f"  - Device: {tensor.device}")
+    if tensor.numel() > 0:
+        try:
+            print(f"  - Stats: Min={tensor.min().item():.4f}, Max={tensor.max().item():.4f}, Mean={tensor.mean().item():.4f}")
+        except RuntimeError:
+            print("  - Stats: could not be computed (e.g., bool tensors).")
+    print("-" * 30)
 
 # ==============================================================================
 # 3. MAIN VIDEO SERVICE CLASS
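
calculate_padding returns (left, right, top, bottom), the order torch.nn.functional.pad uses for the last two dimensions; a worked example with made-up sizes:

pad = calculate_padding(orig_h=480, orig_w=704, target_h=480, target_w=712)
assert pad == (4, 4, 0, 0)  # 8 extra columns, split evenly left/right
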
@@ -230,7 +280,7 @@ class VideoService:
         t0 = time.perf_counter()
         print("[INFO] Initializing VideoService...")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.config = self._load_config("ltxv-13b-0.9.8-dev-fp8.yaml")
+        self.config = self._load_config("ltxv-13b-0.9.8-distilled-fp8.yaml")
 
         self.pipeline, self.latent_upsampler = self._load_models_from_hub()
         self._move_models_to_device()
@@ -241,10 +291,6 @@ class VideoService:
             device=self.device,
             autocast_dtype=self.runtime_autocast_dtype
         )
-
-        self._apply_precision_policy()
-        #print(f"[DEBUG] runtime_autocast_dtype = {getattr(self, 'runtime_autocast_dtype', None)}")
-
         self._tmp_dirs = set()
         RESULTS_DIR.mkdir(exist_ok=True)
         print(f"[INFO] VideoService ready. Initialization time: {time.perf_counter()-t0:.2f}s")
@@ -253,6 +299,30 @@ class VideoService:
     # --- Public Methods (Service API) ---
     # --------------------------------------------------------------------------
 
+    def _prepare_condition_items(self, items_list: List[Tuple], height: int, width: int, num_frames: int) -> List[ConditioningItem]:
+        """Prepares the conditioning tensors from images or tensors."""
+        if not items_list:
+            return []
+
+        height, width = self._calculate_downscaled_dims(height, width)
+
+        height_padded = ((height - 1) // 8 + 1) * 8
+        width_padded = ((width - 1) // 8 + 1) * 8
+        padding_values = calculate_padding(height, width, height_padded, width_padded)
+
+        conditioning_items = []
+        for media, frame_idx, weight in items_list:
+            if isinstance(media, str):
+                tensor = self._prepare_conditioning_tensor_from_path(media, height, width, padding_values)
+            else:  # Assumed to be a tensor
+                tensor = media.to(self.device, dtype=self.runtime_autocast_dtype)
+
+            # Ensure the conditioning frame lies within the video bounds
+            safe_frame_idx = max(0, min(int(frame_idx), num_frames - 1))
+            conditioning_items.append(ConditioningItem(tensor, safe_frame_idx, float(weight)))
+
+        return conditioning_items
+
     def generate_low_resolution(
         self, prompt: str, negative_prompt: str,
         height: int, width: int, duration_secs: float,
@@ -263,45 +333,120 @@ class VideoService:
         Generates a low-resolution video and returns the paths to the video and the latents.
         """
         used_seed = random.randint(0, 2**32 - 1) if seed is None else int(seed)
+        self._seed_everething(used_seed)
+
         actual_num_frames = int(duration_secs * DEFAULT_FPS)
-        #= self._calculate_downscaled_dims(height, width)
 
+        downscaled_height, downscaled_width = self._calculate_downscaled_dims(height, width)
 
         first_pass_kwargs = {
             "prompt": prompt,
             "negative_prompt": negative_prompt,
-            "height": height,
-            "width": width,
-            "num_frames": max(24, actual_num_frames)+1,
+            "height": downscaled_height,
+            "width": downscaled_width,
+            "num_frames": max(3, actual_num_frames//8)+1,
             "frame_rate": int(DEFAULT_FPS),
             "generator": torch.Generator(device=self.device).manual_seed(used_seed),
             "output_type": "latent",
             "conditioning_items": conditioning_items,
             "guidance_scale": float(guidance_scale),
-            "is_video": True,
-            "vae_per_channel_normalize": True,
             **(self.config.get("first_pass", {}))
         }
 
         temp_dir = tempfile.mkdtemp(prefix="ltxv_low_")
         self._register_tmp_dir(temp_dir)
-        latents = self.pipeline(**first_pass_kwargs).images
-        pixel_tensor = vae_manager_singleton.decode(latents.clone(), decode_timestep=float(self.config.get("decode_timestep", 0.05)))
-        video_path = self._save_video_from_tensor(pixel_tensor, "low_res_video", used_seed, temp_dir)
-        latents_path = self._save_latents_to_disk(latents, "latents_low_res", used_seed)
-
+
         try:
+            with torch.autocast(device_type=self.device.split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.device == 'cuda')):
+                latents = self.pipeline(**first_pass_kwargs).images
+                pixel_tensor = vae_manager_singleton.decode(latents.clone(), decode_timestep=float(self.config.get("decode_timestep", 0.05)))
+                video_path = self._save_video_from_tensor(pixel_tensor, "low_res_video", used_seed, temp_dir)
+                latents_path = self._save_latents_to_disk(latents, "latents_low_res", used_seed)
+
             return video_path, latents_path, used_seed
+
+        except Exception as e:
+            print(f"[ERROR] Low-resolution generation failed: {e}")
+            traceback.print_exc()
+            raise
         finally:
             self._finalize()
 
-
+    def generate_upscale_denoise(
+        self, latents_path: str, prompt: str,
+        negative_prompt: str, height: int, width: int,
+        num_frames: float, guidance_scale: float, seed: Optional[int] = None,
+        conditioning_items: Optional[List[ConditioningItem]] = None
+    ) -> Tuple[str, str]:
+        """
+        Applies upscaling, AdaIN and denoising to low-resolution latents using a chunking process.
+        """
+        used_seed = random.randint(0, 2**32 - 1) if seed is None else int(seed)
+        self._seed_everething(used_seed)
+
+        temp_dir = tempfile.mkdtemp(prefix="ltxv_up_")
+        self._register_tmp_dir(temp_dir)
+
+        try:
+            latents_low = torch.load(latents_path).to(self.device)
+            with torch.autocast(device_type=self.device.split(':')[0], dtype=self.runtime_autocast_dtype, enabled=(self.device == 'cuda')):
+                upsampled_latents = latents_low  # self._upsample_and_filter_latents(latents_low)
+
+                # chunks = self._split_latents_with_overlap(upsampled_latents)
+                # refined_chunks = []
+                # for chunk in chunks:
+                #     if chunk.shape[2] <= 1: continue  # Skip invalid chunks
+
+                chunk = upsampled_latents
+
+                second_pass_height = chunk.shape[3] * self.pipeline.vae_scale_factor
+                second_pass_width = chunk.shape[4] * self.pipeline.vae_scale_factor
+
+                second_pass_kwargs = {
+                    "prompt": prompt,
+                    "negative_prompt": negative_prompt,
+                    "height": second_pass_height,
+                    "width": second_pass_width,
+                    "frame_rate": int(DEFAULT_FPS),
+                    "num_frames": num_frames,
+                    "latents": chunk,  # The full tensor is passed here
+                    "guidance_scale": float(guidance_scale),
+                    "output_type": "latent",
+                    "generator": torch.Generator(device=self.device).manual_seed(used_seed),
+                    "conditioning_items": conditioning_items,
+                    **(self.config.get("second_pass", {}))
+                }
+                refined_chunk = self.pipeline(**second_pass_kwargs).images
+                # refined_chunks.append(refined_chunk)
+
+            del latents_low; torch.cuda.empty_cache()
+
+            final_latents = refined_chunk  # self._merge_chunks_with_overlap(refined_chunks)
+            # if LTXV_DEBUG:
+            #     log_tensor_info(final_latents, "Final upscaled/refined latents")
+
+            latents_path = self._save_latents_to_disk(final_latents, "latents_refined", used_seed)
+            pixel_tensor = vae_manager_singleton.decode(final_latents, decode_timestep=float(self.config.get("decode_timestep", 0.05)))
+            video_path = self._save_video_from_tensor(pixel_tensor, "refined_video", used_seed, temp_dir)
+
+            return video_path, latents_path
+
+        except Exception as e:
+            print(f"[ERROR] Upscale and denoise process failed: {e}")
+            traceback.print_exc()
+            raise
+        finally:
+            self._finalize()
 
     def encode_latents_to_mp4(self, latents_path: str, fps: int = int(DEFAULT_FPS)) -> str:
         """Decodes a saved latent tensor and writes it as an MP4 video."""
         latents = torch.load(latents_path)
         temp_dir = tempfile.mkdtemp(prefix="ltxv_enc_")
         self._register_tmp_dir(temp_dir)
-
+        seed = random.randint(0, 99999)  # Seed used only for the file name
+
         try:
             chunks = self._split_latents_with_overlap(latents)
             pixel_chunks = []
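
Taken together, the hunk above wires a two-pass flow: _prepare_condition_items turns (media, frame_idx, weight) tuples into clamped ConditioningItem objects, generate_low_resolution writes low-resolution latents to disk, and generate_upscale_denoise reloads and re-denoises them at the upscaled size. A hedged end-to-end sketch, assuming `service` is a constructed VideoService and the prompt, path, and sizes are illustrative:

items = service._prepare_condition_items(
    items_list=[("inputs/first_frame.png", 0, 1.0)],  # hypothetical image path
    height=480, width=704, num_frames=97,
)
video_lo, latents_lo, seed = service.generate_low_resolution(
    prompt="a sailboat at sunset", negative_prompt="blurry, distorted",
    height=480, width=704, duration_secs=4.0,
    guidance_scale=3.0, conditioning_items=items,
)
video_hi, latents_hi = service.generate_upscale_denoise(
    latents_path=latents_lo, prompt="a sailboat at sunset",
    negative_prompt="blurry, distorted", height=480, width=704,
    num_frames=97, guidance_scale=3.0, seed=seed,
)

Keyword names follow the parameters referenced in the method bodies; the full generate_low_resolution signature is truncated in this diff, so treat the calls as a sketch rather than the exact API.
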
@@ -419,7 +564,12 @@ class VideoService:
         # AdaIN filter to keep color/style consistency with the low-resolution video
         return adain_filter_latent(latents=upsampled_latents_normalized, reference_latents=latents)
 
-
+    def _prepare_conditioning_tensor_from_path(self, filepath: str, height: int, width: int, padding: Tuple) -> torch.Tensor:
+        """Loads an image, resizes it, applies padding, and moves it to the device."""
+        tensor = load_image_to_tensor_with_resize_and_crop(filepath, height, width)
+        tensor = F.pad(tensor, padding)
+        return tensor.to(self.device, dtype=self.runtime_autocast_dtype)
+
     def _calculate_downscaled_dims(self, height: int, width: int) -> Tuple[int, int]:
         """Computes the dimensions for the first (low-resolution) pass."""
         height_padded = ((height - 1) // 8 + 1) * 8
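
F.pad with a 4-tuple pads only the last two dimensions (width pair first, then height), which is why the 5D conditioning tensor can be padded directly; a toy check:

import torch
import torch.nn.functional as F

x = torch.zeros(1, 3, 1, 480, 704)  # (batch, channels, frames, height, width)
y = F.pad(x, (4, 4, 0, 0))          # (left, right, top, bottom)
assert y.shape == (1, 3, 1, 480, 712)
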
@@ -498,17 +648,8 @@ class VideoService:
         if torch.backends.mps.is_available():
             torch.mps.manual_seed(seed)
 
-    def _apply_precision_policy(self):
-        precision = str(self.config.get("precision", "bfloat16")).lower()
-        if precision in ["float8_e4m3fn", "bfloat16"]: self.runtime_autocast_dtype = torch.bfloat16
-        elif precision == "mixed_precision": self.runtime_autocast_dtype = torch.float16
-        else: self.runtime_autocast_dtype = torch.float32
-
-
-
-
 # ==============================================================================
 # 4. INSTANTIATION AND ENTRY POINT (Example)
 # ==============================================================================
-video_generation_service = VideoService()
-print("VideoService instance ready for use.")
+print("Creating the VideoService instance. Model loading will start now...")
+video_generation_service = VideoService()
 