Test4

Paused

App Files Files Community

euiiiia committed on Oct 16

Commit

f0f0810

verified ·

1 Parent(s): 4aa7f1b

Update api/ltx_server_refactored.py

Browse files

Files changed (1) hide show

api/ltx_server_refactored.py +239 -19

api/ltx_server_refactored.py CHANGED Viewed

@@ -299,28 +299,53 @@ class VideoService:
     # --- Métodos Públicos (API do Serviço) ---
     # --------------------------------------------------------------------------
-    def _prepare_condition_items(self, items_list: List[Tuple], height: int, width: int, num_frames: int) -> List[ConditioningItem]:
-        """Prepare the conditioning tensors from image paths or tensors."""
         if not items_list:
             return []
-        height, width = self._calculate_downscaled_dims(height, width)
-        height_padded = ((height - 1) // 8 + 1) * 8
-        width_padded = ((width - 1) // 8 + 1) * 8
-        padding_values = calculate_padding(height, width, height_padded, width_padded)
-        conditioning_items = []
         for media, frame_idx, weight in items_list:
             if isinstance(media, str):
-                tensor = self._prepare_conditioning_tensor_from_path(media, height, width, padding_values)
-            else: # Assumes it is a tensor
-                tensor = media.to(self.device, dtype=self.runtime_autocast_dtype)
-            # Keep the conditioning frame index inside the bounds of the video
-            safe_frame_idx = max(0, min(int(frame_idx), num_frames - 1))
-            conditioning_items.append(ConditioningItem(tensor, safe_frame_idx, float(weight)))
         return conditioning_items
     def generate_low_resolution(
@@ -457,4 +482,199 @@ class VideoService:
                     pixel_chunk = vae_manager_singleton.decode(chunk.to(self.device), decode_timestep=float(self.config.get("decode_timestep", 0.05)))
                     pixel_chunks.append(pixel_chunk)
-            final_pixel_tensor = self._merge_chunks_with_ove

     # --- Métodos Públicos (API do Serviço) ---
     # --------------------------------------------------------------------------
+    def _prepare_condition_items(
+        self,
+        items_list: List[Tuple[Union[str, Image.Image, torch.Tensor], int, float]],
+        height: int,
+        width: int,
+        num_frames: int,
+    ) -> List[ConditioningItem]:
+        """
+        Prepara ConditioningItem a partir de paths, PIL.Images ou tensores.
+        """
         if not items_list:
             return []
+        # calcula dims downscaled (múltiplo de patch temporal)
+        down_h, down_w = self._calculate_downscaled_dims(height, width)
+        # ajusta padding para múltiplos de 8
+        pad_h = ((down_h - 1) // 8 + 1) * 8
+        pad_w = ((down_w - 1) // 8 + 1) * 8
+        padding = calculate_padding(down_h, down_w, pad_h, pad_w)
+        conditioning_items: List[ConditioningItem] = []
         for media, frame_idx, weight in items_list:
+            # carrega raw_item como PIL.Image ou tensor
             if isinstance(media, str):
+                img = Image.open(media).convert("RGB")
+                raw_item = ImageOps.fit(img, (down_w, down_h), Image.LANCZOS)
+            elif isinstance(media, Image.Image):
+                raw_item = ImageOps.fit(media, (down_w, down_h), Image.LANCZOS)
+            elif isinstance(media, torch.Tensor):
+                raw_item = media.to(device=self.device, dtype=self.runtime_autocast_dtype)
+            else:
+                raise TypeError(f"Tipo de media não suportado: {type(media)}")
+            # garante frame index seguro
+            safe_frame = max(0, min(int(frame_idx), num_frames - 1))
+            # codifica raw_item em latentes via VAE (inclui expansão de frame se precisar)
+            cond_item = self.encode_conditioning_item(
+                raw_item,
+                frame_number=safe_frame,
+                strength=float(weight),
+                height=down_h,
+                width=down_w,
+                vae_per_channel_normalize=self.vae_per_channel_normalize,
+            )
+            conditioning_items.append(cond_item)
         return conditioning_items
     def generate_low_resolution(
                     pixel_chunk = vae_manager_singleton.decode(chunk.to(self.device), decode_timestep=float(self.config.get("decode_timestep", 0.05)))
                     pixel_chunks.append(pixel_chunk)
+            final_pixel_tensor = self._merge_chunks_with_overlap(pixel_chunks)
+            final_video_path = self._save_video_from_tensor(final_pixel_tensor, f"final_video_{seed}", seed, temp_dir, fps=fps)
+            return final_video_path
+        except Exception as e:
+            print(f"[ERROR] Falha ao encodar latentes para MP4: {e}")
+            traceback.print_exc()
+            raise
+        finally:
+            self._finalize()
+    # --------------------------------------------------------------------------
+    # --- Métodos Internos e Auxiliares ---
+    # --------------------------------------------------------------------------
+    def _finalize(self):
+        """Limpa a memória da GPU e os diretórios temporários."""
+        if LTXV_DEBUG:
+            print("[DEBUG] Finalize: iniciando limpeza...")
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+        # Limpa todos os diretórios temporários registrados
+        for d in list(self._tmp_dirs):
+            shutil.rmtree(d, ignore_errors=True)
+            self._tmp_dirs.remove(d)
+            if LTXV_DEBUG:
+                print(f"[DEBUG] Diretório temporário removido: {d}")
+    def _load_config(self, config_filename: str) -> Dict:
+        """Carrega o arquivo de configuração YAML."""
+        config_path = LTX_VIDEO_REPO_DIR / "configs" / config_filename
+        print(f"[INFO] Carregando configuração de: {config_path}")
+        with open(config_path, "r") as file:
+            return yaml.safe_load(file)
+    def _load_models_from_hub(self) -> Tuple[LTXMultiScalePipeline, Optional[torch.nn.Module]]:
+        """Baixa e cria as instâncias da pipeline e do upsampler."""
+        t0 = time.perf_counter()
+        LTX_REPO = "Lightricks/LTX-Video"
+        print("[INFO] Baixando checkpoint principal...")
+        self.config["checkpoint_path"] = hf_hub_download(
+            repo_id=LTX_REPO, filename=self.config["checkpoint_path"],
+            token=os.getenv("HF_TOKEN")
+        )
+        print(f"[INFO] Checkpoint principal em: {self.config['checkpoint_path']}")
+        print("[INFO] Construindo pipeline...")
+        pipeline = create_ltx_video_pipeline(
+            ckpt_path=self.config["checkpoint_path"],
+            precision=self.config["precision"],
+            text_encoder_model_name_or_path=self.config["text_encoder_model_name_or_path"],
+            sampler=self.config["sampler"],
+            device="cpu",  # Carrega em CPU primeiro
+            enhance_prompt=False
+        )
+        print("[INFO] Pipeline construída.")
+        latent_upsampler = None
+        if self.config.get("spatial_upscaler_model_path"):
+            print("[INFO] Baixando upscaler espacial...")
+            self.config["spatial_upscaler_model_path"] = hf_hub_download(
+                repo_id=LTX_REPO, filename=self.config["spatial_upscaler_model_path"],
+                token=os.getenv("HF_TOKEN")
+            )
+            print(f"[INFO] Upscaler em: {self.config['spatial_upscaler_model_path']}")
+            print("[INFO] Construindo latent_upsampler...")
+            latent_upsampler = create_latent_upsampler(self.config["spatial_upscaler_model_path"], device="cpu")
+            print("[INFO] Latent upsampler construído.")
+        print(f"[INFO] Carregamento de modelos concluído em {time.perf_counter()-t0:.2f}s")
+        return pipeline, latent_upsampler
+    def _move_models_to_device(self):
+        """Move os modelos carregados para o dispositivo de computação (GPU/CPU)."""
+        print(f"[INFO] Movendo modelos para o dispositivo: {self.device}")
+        self.pipeline.to(self.device)
+        if self.latent_upsampler:
+            self.latent_upsampler.to(self.device)
+    def _get_precision_dtype(self) -> torch.dtype:
+        """Determina o dtype para autocast com base na configuração de precisão."""
+        prec = str(self.config.get("precision", "")).lower()
+        if prec in ["float8_e4m3fn", "bfloat16"]:
+            return torch.bfloat16
+        elif prec == "mixed_precision":
+            return torch.float16
+        return torch.float32
+    @torch.no_grad()
+    def _upsample_and_filter_latents(self, latents: torch.Tensor) -> torch.Tensor:
+        """Aplica o upsample espacial e o filtro AdaIN aos latentes."""
+        if not self.latent_upsampler:
+            raise ValueError("Latent Upsampler não está carregado para a operação de upscale.")
+        latents_unnormalized = un_normalize_latents(latents, self.pipeline.vae, vae_per_channel_normalize=True)
+        upsampled_latents_unnormalized = self.latent_upsampler(latents_unnormalized)
+        upsampled_latents_normalized = normalize_latents(upsampled_latents_unnormalized, self.pipeline.vae, vae_per_channel_normalize=True)
+        # Filtro AdaIN para manter consistência de cor/estilo com o vídeo de baixa resolução
+        return adain_filter_latent(latents=upsampled_latents_normalized, reference_latents=latents)
+    def _prepare_conditioning_tensor_from_path(self, filepath: str, height: int, width: int, padding: Tuple) -> torch.Tensor:
+        """Carrega uma imagem, redimensiona, aplica padding e move para o dispositivo."""
+        tensor = load_image_to_tensor_with_resize_and_crop(filepath, height, width)
+        tensor = F.pad(tensor, padding)
+        return tensor.to(self.device, dtype=self.runtime_autocast_dtype)
+    def _calculate_downscaled_dims(self, height: int, width: int) -> Tuple[int, int]:
+        """Calcula as dimensões para o primeiro passo (baixa resolução)."""
+        height_padded = ((height - 1) // 8 + 1) * 8
+        width_padded = ((width - 1) // 8 + 1) * 8
+        downscale_factor = self.config.get("downscale_factor", 0.6666666)
+        vae_scale_factor = self.pipeline.vae_scale_factor
+        target_w = int(width_padded * downscale_factor)
+        downscaled_width = target_w - (target_w % vae_scale_factor)
+        target_h = int(height_padded * downscale_factor)
+        downscaled_height = target_h - (target_h % vae_scale_factor)
+        return downscaled_height, downscaled_width
+    def _split_latents_with_overlap(self, latents: torch.Tensor, overlap: int = 1) -> List[torch.Tensor]:
+        """Divide um tensor de latentes em dois chunks com sobreposição."""
+        total_frames = latents.shape[2]
+        if total_frames <= overlap:
+            return [latents]
+        mid_point = max(overlap, total_frames // 2)
+        chunk1 = latents[:, :, :mid_point, :, :]
+        # O segundo chunk começa 'overlap' frames antes para criar a sobreposição
+        chunk2 = latents[:, :, mid_point - overlap:, :, :]
+        return [c for c in [chunk1, chunk2] if c.shape[2] > 0]
+    def _merge_chunks_with_overlap(self, chunks: List[torch.Tensor], overlap: int = 1) -> torch.Tensor:
+        """Junta uma lista de chunks, removendo a sobreposição."""
+        if not chunks:
+            return torch.empty(0)
+        if len(chunks) == 1:
+            return chunks[0]
+        # Pega o primeiro chunk sem o frame de sobreposição final
+        merged_list = [chunks[0][:, :, :-overlap, :, :]]
+        # Adiciona os chunks restantes
+        merged_list.extend(chunks[1:])
+        return torch.cat(merged_list, dim=2)
+    def _save_latents_to_disk(self, latents_tensor: torch.Tensor, base_filename: str, seed: int) -> str:
+        """Salva um tensor de latentes em um arquivo .pt."""
+        latents_cpu = latents_tensor.detach().to("cpu")
+        tensor_path = RESULTS_DIR / f"{base_filename}_{seed}.pt"
+        torch.save(latents_cpu, tensor_path)
+        if LTXV_DEBUG:
+            print(f"[DEBUG] Latentes salvos em: {tensor_path}")
+        return str(tensor_path)
+    def _save_video_from_tensor(self, pixel_tensor: torch.Tensor, base_filename: str, seed: int, temp_dir: str, fps: int = int(DEFAULT_FPS)) -> str:
+        """Salva um tensor de pixels como um arquivo de vídeo MP4."""
+        temp_path = os.path.join(temp_dir, f"{base_filename}_{seed}.mp4")
+        video_encode_tool_singleton.save_video_from_tensor(pixel_tensor, temp_path, fps=fps)
+        final_path = RESULTS_DIR / f"{base_filename}_{seed}.mp4"
+        shutil.move(temp_path, final_path)
+        print(f"[INFO] Vídeo final salvo em: {final_path}")
+        return str(final_path)
+    def _register_tmp_dir(self, dir_path: str):
+        """Registra um diretório temporário para limpeza posterior."""
+        if dir_path and os.path.isdir(dir_path):
+            self._tmp_dirs.add(dir_path)
+            if LTXV_DEBUG:
+                print(f"[DEBUG] Diretório temporário registrado: {dir_path}")
+    def _seed_everething(self, seed: int):
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed(seed)
+        if torch.backends.mps.is_available():
+            torch.mps.manual_seed(seed)
+# ==============================================================================
+# 4. INSTANTIATION AND ENTRY POINT (Example)
+# ==============================================================================
+# NOTE: this runs at import time, so importing the module constructs the service.
+video_generation_service = VideoService()
+print("Instância do VideoService pronta para uso.")