eeuuia committed
Commit 460fa35 · verified · 1 Parent(s): ae38dbc

Update api/ltx/ltx_aduc_manager.py

Files changed (1)
  1. api/ltx/ltx_aduc_manager.py +215 -132
api/ltx/ltx_aduc_manager.py CHANGED
@@ -1,160 +1,243 @@
  # FILE: api/ltx/ltx_aduc_manager.py
- # DESCRIPTION: The "secret weapon". A pool manager for LTX that applies
- # runtime patches to the pipeline for full control and ADUC-SDR compatibility.

  import logging
- from typing import Dict, List, Optional, Tuple, Union
- from dataclasses import dataclass
  import torch
- from diffusers.utils.torch_utils import randn_tensor
  import sys
  from pathlib import Path
- import os
- import random
- import yaml

- LTX_REPO_ID = "Lightricks/LTX-Video"
- DEPS_DIR = Path("/data")
- LTX_VIDEO_REPO_DIR = DEPS_DIR / "LTX-Video"
- RESULTS_DIR = Path("/app/output")
-
- # --- Imports from our own architecture ---
  from managers.gpu_manager import gpu_manager
- from api.ltx.ltx_utils import build_ltx_pipeline_on_cpu

  def add_deps_to_path():
-     """
-     Adds the LTX repository directory to sys.path so that its
-     libraries can be imported.
-     """
      repo_path = str(LTX_VIDEO_REPO_DIR.resolve())
      if repo_path not in sys.path:
          sys.path.insert(0, repo_path)
-         logging.info(f"[ltx_utils] LTX-Video repository added to sys.path: {repo_path}")
-
- # Run the function immediately to set up the environment before any imports.
  add_deps_to_path()
- from ltx_video.pipelines.pipeline_ltx_video import LTXVideoPipeline
- # Note: the patch below also needs latent_to_pixel_coords; its location in the LTX-Video repo is assumed here:
- from ltx_video.models.autoencoders.vae_encode import latent_to_pixel_coords
-
- # --- Definition of our data classes ---
- @dataclass
- class ConditioningItem:
-     pixel_tensor: torch.Tensor  # Always a pixel tensor
-     media_frame_number: int
-     conditioning_strength: float

- @dataclass
- class LatentConditioningItem:
-     latent_tensor: torch.Tensor  # Always a latent tensor
-     media_frame_number: int
-     conditioning_strength: float

  # ==============================================================================
- # --- THE MONKEY PATCH ---
- # This is our customized version of `prepare_conditioning`.
  # ==============================================================================
- def _aduc_prepare_conditioning_patch(
-     self: "LTXVideoPipeline",
-     conditioning_items: Optional[List[Union["ConditioningItem", "LatentConditioningItem"]]],
-     init_latents: torch.Tensor,
-     num_frames: int,
-     height: int,
-     width: int,
-     vae_per_channel_normalize: bool = False,
-     generator=None,
- ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
-     if not conditioning_items:
-         init_latents, init_latent_coords = self.patchifier.patchify(latents=init_latents)
-         init_pixel_coords = latent_to_pixel_coords(init_latent_coords, self.vae, causal_fix=self.transformer.config.causal_temporal_positioning)
-         return init_latents, init_pixel_coords, None, 0
-
-     init_conditioning_mask = torch.zeros_like(init_latents[:, 0, ...], dtype=torch.float32, device=init_latents.device)
-     extra_conditioning_latents, extra_conditioning_pixel_coords, extra_conditioning_mask = [], [], []
-     extra_conditioning_num_latents = 0
-
-     for item in conditioning_items:
-         if not isinstance(item, LatentConditioningItem):
-             logging.warning("ADUC patch: conditioning item is not a LatentConditioningItem and will be ignored.")
-             continue
-
-         media_item_latents = item.latent_tensor.to(dtype=init_latents.dtype, device=init_latents.device)
-         media_frame_number, strength = item.media_frame_number, item.conditioning_strength
-
-         if media_frame_number == 0:
-             f_l, h_l, w_l = media_item_latents.shape[-3:]
-             init_latents[..., :f_l, :h_l, :w_l] = torch.lerp(init_latents[..., :f_l, :h_l, :w_l], media_item_latents, strength)
-             init_conditioning_mask[..., :f_l, :h_l, :w_l] = strength
-         else:
-             noise = randn_tensor(media_item_latents.shape, generator=generator, device=media_item_latents.device, dtype=media_item_latents.dtype)
-             media_item_latents = torch.lerp(noise, media_item_latents, strength)
-             patched_latents, latent_coords = self.patchifier.patchify(latents=media_item_latents)
-             pixel_coords = latent_to_pixel_coords(latent_coords, self.vae, causal_fix=self.transformer.config.causal_temporal_positioning)
-             pixel_coords[:, 0] += media_frame_number
-             extra_conditioning_num_latents += patched_latents.shape[1]
-             new_mask = torch.full(patched_latents.shape[:2], strength, dtype=torch.float32, device=init_latents.device)
-             extra_conditioning_latents.append(patched_latents)
-             extra_conditioning_pixel_coords.append(pixel_coords)
-             extra_conditioning_mask.append(new_mask)
-
-     init_latents, init_latent_coords = self.patchifier.patchify(latents=init_latents)
-     init_pixel_coords = latent_to_pixel_coords(init_latent_coords, self.vae, causal_fix=self.transformer.config.causal_temporal_positioning)
-     init_conditioning_mask, _ = self.patchifier.patchify(latents=init_conditioning_mask.unsqueeze(1))
-     init_conditioning_mask = init_conditioning_mask.squeeze(-1)
-
-     if extra_conditioning_latents:
-         init_latents = torch.cat([*extra_conditioning_latents, init_latents], dim=1)
-         init_pixel_coords = torch.cat([*extra_conditioning_pixel_coords, init_pixel_coords], dim=2)
-         init_conditioning_mask = torch.cat([*extra_conditioning_mask, init_conditioning_mask], dim=1)
-
-     return init_latents, init_pixel_coords, init_conditioning_mask, extra_conditioning_num_latents

  # ==============================================================================
- # --- LTX Worker and Pool Manager ---
  # ==============================================================================

- class LTXWorker:
-     """Manages one LTX pipeline instance on a pair of GPUs (main + vae)."""
-     def __init__(self, main_device: str, vae_device: str, config: dict):
-         self.main_device = torch.device(main_device)
-         self.vae_device = torch.device(vae_device)
-         self.config = config
-         self.pipeline: LTXVideoPipeline = None
-
-         self._load_and_patch_pipeline()

-     def _load_and_patch_pipeline(self):
-         logging.info(f"[LTXWorker-{self.main_device}] Loading LTX pipeline onto the CPU...")
-         self.pipeline, _ = build_ltx_pipeline_on_cpu(self.config)

-         logging.info(f"[LTXWorker-{self.main_device}] Moving pipeline to GPUs (Main: {self.main_device}, VAE: {self.vae_device})...")
-         self.pipeline.to(self.main_device)
-         self.pipeline.vae.to(self.vae_device)
-
-         logging.info(f"[LTXWorker-{self.main_device}] Applying the ADUC-SDR patch to 'prepare_conditioning'...")
-         # The monkey-patching "magic" happens here
-         self.pipeline.prepare_conditioning = _aduc_prepare_conditioning_patch.__get__(self.pipeline, LTXVideoPipeline)
-         logging.info(f"[LTXWorker-{self.main_device}] ✅ Pipeline hot, patched, and ready.")

- class LTXAducManager:
-     def __init__(self):
-         main_device = gpu_manager.get_ltx_device()
-         vae_device = gpu_manager.get_ltx_vae_device()
-         # In a future architecture we could have multiple workers. For now, there is one.
-         self.worker = LTXWorker(str(main_device), str(vae_device), self.load_config())
-
-     def load_config(self) -> Dict:
-         """Loads the YAML configuration file."""
-         config_path = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled-fp8.yaml"
-         with open(config_path, "r") as file:
-             return yaml.safe_load(file)
-
-     def get_pipeline(self) -> LTXVideoPipeline:
-         return self.worker.pipeline
-
- # Singleton instance
- ltx_aduc_manager = LTXAducManager()
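The removed worker grafts the patch onto a live pipeline via Python's descriptor protocol: `function.__get__(instance, cls)` returns the function bound to that instance, so the override receives `self` automatically without touching the class. A minimal, self-contained sketch of that binding technique (illustrative names only, not part of the commit):

class Greeter:
    def hello(self) -> str:
        return "original"

def patched_hello(self) -> str:
    # `self` arrives automatically because the function is bound below.
    return f"patched on {type(self).__name__}"

g = Greeter()
g.hello = patched_hello.__get__(g, Greeter)  # instance-level override, as in the worker above
print(g.hello())          # -> "patched on Greeter"
print(Greeter().hello())  # other instances keep the original -> "original"

Patching the instance rather than the class keeps the override scoped to the one pipeline the worker owns.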
 
  # FILE: api/ltx/ltx_aduc_manager.py
+ # DESCRIPTION: An advanced, fault-tolerant pool manager for LTX and VAE workers.
+ # It handles job queuing, load balancing, and health monitoring for production-grade stability.

  import logging
  import torch
  import sys
  from pathlib import Path
+ import threading
+ import queue
+ import time
+ from typing import List, Optional, Callable, Any, Tuple

+ # Imports for the builders and the gpu_manager
+ from api.ltx.ltx_utils import get_main_ltx_pipeline, get_main_vae
  from managers.gpu_manager import gpu_manager

+ # --- Add the LTX-Video path so its types can be imported ---
+ LTX_VIDEO_REPO_DIR = Path("/data/LTX-Video")
  def add_deps_to_path():
      repo_path = str(LTX_VIDEO_REPO_DIR.resolve())
      if repo_path not in sys.path:
          sys.path.insert(0, repo_path)
  add_deps_to_path()

+ from ltx_video.pipelines.pipeline_ltx_video import LTXVideoPipeline
+ from ltx_video.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder

  # ==============================================================================
+ # --- WORKER CLASSES (task specialists) ---
  # ==============================================================================

+ class BaseWorker(threading.Thread):
+     """Base class for our workers, with state and health management."""
+     def __init__(self, worker_id: int, device: torch.device):
+         super().__init__()
+         self.worker_id = worker_id
+         self.device = device
+         self.is_healthy = False
+         self.is_busy = False
+         self.daemon = True  # Lets the main program exit without waiting for workers
+
+     def run(self):
+         """The worker's lifecycle entry point, responsible for loading the models."""
+         try:
+             self._load_models()
+             self.is_healthy = True
+             logging.info(f"✅ Worker {self.worker_id} ({self.__class__.__name__}) on {self.device} is healthy and ready.")
+         except Exception:
+             self.is_healthy = False
+             logging.error(f"❌ Worker {self.worker_id} on {self.device} FAILED to initialize!", exc_info=True)
+
+     def _load_models(self):
+         """Method to be implemented by subclasses."""
+         raise NotImplementedError
+
+     def get_status(self) -> Tuple[bool, bool]:
+         """Returns (is_healthy, is_busy)."""
+         return self.is_healthy, self.is_busy
+
+ class LTXMainWorker(BaseWorker):
+     """Specialist worker for the main LTX pipeline."""
+     def __init__(self, worker_id: int, device: torch.device):
+         super().__init__(worker_id, device)
+         self.pipeline: Optional[LTXVideoPipeline] = None
+
+     def _load_models(self):
+         logging.info(f"[LTXWorker-{self.worker_id}] Loading models to CPU...")
+         self.pipeline = get_main_ltx_pipeline()
+         logging.info(f"[LTXWorker-{self.worker_id}] Moving pipeline to {self.device}...")
+         self.pipeline.to(self.device)
+
+     def execute(self, job_func: Callable, args: tuple, kwargs: dict) -> Any:
+         """Executes a job, managing the 'busy' state."""
+         self.is_busy = True
+         logging.info(f"Worker {self.worker_id} (LTX) starting job: {job_func.__name__}")
+         try:
+             result = job_func(self.pipeline, *args, **kwargs)
+             logging.info(f"Worker {self.worker_id} (LTX) finished job successfully.")
+             return result
+         except Exception:
+             logging.error(f"Worker {self.worker_id} (LTX) job failed!", exc_info=True)
+             self.is_healthy = False  # A failed job marks the worker as unhealthy
+             raise
+         finally:
+             self.is_busy = False
+
+ class VAEWorker(BaseWorker):
+     """Specialist worker for the VAE model."""
+     def __init__(self, worker_id: int, device: torch.device):
+         super().__init__(worker_id, device)
+         self.vae: Optional[CausalVideoAutoencoder] = None
+
+     def _load_models(self):
+         logging.info(f"[VAEWorker-{self.worker_id}] Loading VAE model to CPU...")
+         self.vae = get_main_vae()
+         logging.info(f"[VAEWorker-{self.worker_id}] Moving VAE to {self.device}...")
+         self.vae.to(self.device)
+         self.vae.eval()
+
+     def execute(self, job_func: Callable, args: tuple, kwargs: dict) -> Any:
+         """Executes a job, managing the 'busy' state."""
+         self.is_busy = True
+         logging.info(f"Worker {self.worker_id} (VAE) starting job: {job_func.__name__}")
+         try:
+             result = job_func(self.vae, *args, **kwargs)
+             logging.info(f"Worker {self.worker_id} (VAE) finished job successfully.")
+             return result
+         except Exception:
+             logging.error(f"Worker {self.worker_id} (VAE) job failed!", exc_info=True)
+             self.is_healthy = False
+             raise
+         finally:
+             self.is_busy = False

  # ==============================================================================
+ # --- THE ADVANCED POOL MANAGER (SINGLETON) ---
  # ==============================================================================
+ class LTXAducManager:
+     _instance = None
+     _initialized = False
+
+     def __new__(cls, *args, **kwargs):
+         if cls._instance is None:
+             cls._instance = super().__new__(cls)
+         return cls._instance
+
+     def __init__(self):
+         if self._initialized: return
+
+         logging.info("🏭 Initializing Advanced Pool Manager for LTX...")
+
+         self.ltx_workers: List[LTXMainWorker] = []
+         self.vae_workers: List[VAEWorker] = []
+         self.ltx_job_queue = queue.Queue()
+         self.vae_job_queue = queue.Queue()
+         self.pool_lock = threading.Lock()
+
+         self._initialize_workers()
+
+         # Start consumer threads to process the queues
+         self.ltx_dispatcher = threading.Thread(target=self._dispatch_jobs, args=(self.ltx_job_queue, self.ltx_workers), daemon=True)
+         self.vae_dispatcher = threading.Thread(target=self._dispatch_jobs, args=(self.vae_job_queue, self.vae_workers), daemon=True)
+         self.health_monitor = threading.Thread(target=self._health_check_loop, daemon=True)
+
+         self.ltx_dispatcher.start()
+         self.vae_dispatcher.start()
+         self.health_monitor.start()
+
+         self._initialized = True
+         logging.info("✅ Advanced Pool Manager is running with all threads started.")
+
+     def _initialize_workers(self):
+         """Creates and starts the workers based on the allocated GPUs."""
+         # Assumes gpu_manager exposes one device per role; adjust if the names differ
+         ltx_gpus = gpu_manager.get_ltx_device()
+         vae_gpus = gpu_manager.get_ltx_vae_device()
+
+         with self.pool_lock:
+             for i, device_id in enumerate([ltx_gpus]):  # The single returned device is wrapped in a list
+                 worker = LTXMainWorker(worker_id=i, device=torch.device(f"cuda:{device_id}"))
+                 self.ltx_workers.append(worker)
+                 worker.start()
+
+             for i, device_id in enumerate([vae_gpus]):  # The single returned device is wrapped in a list
+                 worker = VAEWorker(worker_id=i, device=torch.device(f"cuda:{device_id}"))
+                 self.vae_workers.append(worker)
+                 worker.start()
+
+     def _get_available_worker(self, worker_pool: List[BaseWorker]) -> Optional[BaseWorker]:
+         """Finds a healthy, idle worker in the pool."""
+         with self.pool_lock:
+             for worker in worker_pool:
+                 healthy, busy = worker.get_status()
+                 if healthy and not busy:
+                     return worker
+         return None
+
+     def _dispatch_jobs(self, job_queue: queue.Queue, worker_pool: List[BaseWorker]):
+         """Consumer-thread loop that takes jobs off the queue and dispatches them."""
+         while True:
+             job_func, args, kwargs, future = job_queue.get()
+             worker = None
+             while worker is None:
+                 worker = self._get_available_worker(worker_pool)
+                 if worker is None:
+                     time.sleep(0.1)  # Wait for a worker to become free
+
+             try:
+                 result = worker.execute(job_func, args, kwargs)
+                 future.put(result)
+             except Exception as e:
+                 future.put(e)
+
+     def _health_check_loop(self):
+         """Thread that periodically checks for and restarts unhealthy workers."""
+         while True:
+             time.sleep(30)
+             logging.debug("Running health check on all workers...")
+             with self.pool_lock:
+                 for i, worker in enumerate(self.ltx_workers):
+                     if not worker.is_alive() or not worker.is_healthy:
+                         logging.warning(f"LTX Worker {worker.worker_id} on {worker.device} is UNHEALTHY. Restarting...")
+                         new_worker = LTXMainWorker(worker.worker_id, worker.device)
+                         self.ltx_workers[i] = new_worker
+                         new_worker.start()
+                 # Repeat the loop for the VAE workers
+                 for i, worker in enumerate(self.vae_workers):
+                     if not worker.is_alive() or not worker.is_healthy:
+                         logging.warning(f"VAE Worker {worker.worker_id} on {worker.device} is UNHEALTHY. Restarting...")
+                         new_worker = VAEWorker(worker.worker_id, worker.device)
+                         self.vae_workers[i] = new_worker
+                         new_worker.start()
+
+     def submit_job(self, job_type: str, job_func: Callable, *args, **kwargs) -> Any:
+         """
+         Public entry point for submitting a job to the pool.
+         This function is synchronous: it waits for the result.
+         """
+         if job_type not in ['ltx', 'vae']:
+             raise ValueError("Invalid job_type. Must be 'ltx' or 'vae'.")
+
+         job_queue = self.ltx_job_queue if job_type == 'ltx' else self.vae_job_queue
+         future = queue.Queue()  # We use a queue as a 'future' to get the result back
+
+         job_queue.put((job_func, args, kwargs, future))
+
+         # Block until the dispatcher puts the result into the 'future'
+         result = future.get()
+
+         if isinstance(result, Exception):
+             raise result  # If the job failed, re-raise the exception in the main thread
+
+         return result

+ # ==============================================================================
+ # --- GLOBAL INSTANTIATION ---
+ # ==============================================================================
+ try:
+     ltx_aduc_manager = LTXAducManager()
+ except Exception:
+     logging.critical("CRITICAL ERROR: Failed to initialize the LTXAducManager pool.", exc_info=True)
+     ltx_aduc_manager = None
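For context, here is a minimal caller-side sketch of how a job flows through the new pool. It is hypothetical usage, not part of this commit, and the pipeline call inside `generate_clip` is an assumed signature: `submit_job` enqueues the job together with a per-job `queue.Queue` acting as a future, a dispatcher thread hands the worker's model to the job function as its first argument, and the caller blocks until the result (or the job's re-raised exception) comes back.

from api.ltx.ltx_aduc_manager import ltx_aduc_manager

def generate_clip(pipeline, prompt: str, num_frames: int):
    # Runs on whichever healthy, idle LTXMainWorker the dispatcher picks;
    # `pipeline` is that worker's LTXVideoPipeline, already on its GPU.
    # The keyword arguments below are illustrative, not the confirmed API.
    return pipeline(prompt=prompt, num_frames=num_frames)

if ltx_aduc_manager is not None:
    # Synchronous: blocks until the dispatcher returns a result or re-raises.
    video = ltx_aduc_manager.submit_job('ltx', generate_clip, "a calm ocean at dusk", num_frames=121)

Using a plain `queue.Queue` as the future keeps the design dependency-free; a `concurrent.futures.Future` would serve the same role if asynchronous submission were needed later.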