eeuuia committed
Commit 52c58b6 · verified · 1 Parent(s): 24228a2

Update api/ltx/ltx_aduc_manager.py

Files changed (1)
  1. api/ltx/ltx_aduc_manager.py +90 -142
api/ltx/ltx_aduc_manager.py CHANGED
@@ -1,6 +1,7 @@
  # FILE: api/ltx/ltx_aduc_manager.py
- # DESCRIPTION: An advanced, fault-tolerant pool manager for LTX and VAE workers.
- # It is self-contained, orchestrating the construction, health, and job dispatching for its workers.

  import logging
  import torch
@@ -10,12 +11,13 @@ import threading
  import queue
  import time
  import yaml
  from huggingface_hub import hf_hub_download
  from typing import List, Optional, Callable, Any, Tuple, Dict
- import os
  # --- Import the GPU manager and the low-level builder ---
  from managers.gpu_manager import gpu_manager
- from api.ltx.ltx_utils import build_components_on_cpu

  # --- Add the LTX-Video path so types can be imported ---
  LTX_VIDEO_REPO_DIR = Path("/data/LTX-Video")
@@ -26,174 +28,138 @@ def add_deps_to_path():
  add_deps_to_path()

  from ltx_video.pipelines.pipeline_ltx_video import LTXVideoPipeline
- from ltx_video.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder

  # ==============================================================================
- # --- WORKER CLASSES (task specialists) ---
  # ==============================================================================

- class BaseWorker(threading.Thread):
-     """Base class for our workers, with state and health management."""
-     def __init__(self, worker_id: int, device: torch.device, model: torch.nn.Module):
          super().__init__()
          self.worker_id = worker_id
-         self.device = device
-         self.model = model
          self.is_healthy = False
          self.is_busy = False
          self.daemon = True

      def run(self):
-         """The worker's life-cycle loop, responsible for moving the model to the GPU."""
-         if True:
-             logging.info(f"Worker {self.worker_id} ({self.__class__.__name__}) moving model to {self.device}...")
-             self.model.to(self.device)
-             self._post_load_hook()
              self.is_healthy = True
-             logging.info(f"✅ Worker {self.worker_id} ({self.__class__.__name__}) on {self.device} is healthy and ready.")
-         #except Exception:
              self.is_healthy = False
-             logging.error(f"❌ Worker {self.worker_id} on {self.device} FAILED to initialize!", exc_info=True)
-
-     def _post_load_hook(self):
-         """Hook for post-load actions, such as calling .eval()."""
-         pass
-
-     def get_status(self) -> Tuple[bool, bool]:
-         return self.is_healthy, self.is_busy
-
- class LTXMainWorker(BaseWorker):
-     """Specialist worker for the main LTX pipeline."""
-     def __init__(self, worker_id: int, device: torch.device, pipeline: LTXVideoPipeline):
-         super().__init__(worker_id, device, pipeline)
-         self.pipeline = self.model
-         self.autocast_dtype: torch.dtype = torch.float32
-
-     def _post_load_hook(self):
-         self._set_precision_policy()

      def _set_precision_policy(self):
-         if True: #try:
              config_path = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled-fp8.yaml"
-             with open(config_path, "r") as file:
-                 config = yaml.safe_load(file)
              precision = str(config.get("precision", "bfloat16")).lower()
              if precision in ["float8_e4m3fn", "bfloat16"]: self.autocast_dtype = torch.bfloat16
              elif precision == "mixed_precision": self.autocast_dtype = torch.float16
-             logging.info(f"[LTXWorker-{self.worker_id}] Autocast precision policy set to {self.autocast_dtype}")
-         #except Exception as e:
-             #logging.warning(f"[LTXWorker-{self.worker_id}] Could not set precision policy from config. Defaulting to float32. Error: {e}")

      def execute(self, job_func: Callable, args: tuple, kwargs: dict) -> Any:
          self.is_busy = True
-         if True: #try:
              result = job_func(self.pipeline, self.autocast_dtype, *args, **kwargs)
              return result
-         #except Exception:
-         #    self.is_healthy = False
-         #    raise
-         #finally:
-         #    self.is_busy = False
-
- class VAEWorker(BaseWorker):
-     """Specialist worker for the VAE model."""
-     def __init__(self, worker_id: int, device: torch.device, vae: CausalVideoAutoencoder):
-         super().__init__(worker_id, device, vae)
-         self.vae = self.model
-
-     def _post_load_hook(self):
-         self.vae.eval()
-
-     def execute(self, job_func: Callable, args: tuple, kwargs: dict) -> Any:
-         self.is_busy = True
-         if True: #try:
-             result = job_func(self.vae, *args, **kwargs)
-             return result
-         #except Exception:
-         #    self.is_healthy = False
-         #    raise
-         #finally:
-         #    self.is_busy = False

  # ==============================================================================
- # --- THE ADVANCED POOL MANAGER (SINGLETON) ---
  # ==============================================================================
  class LTXAducManager:
      _instance = None
      _initialized = False

      def __new__(cls, *args, **kwargs):
-         if cls._instance is None:
-             cls._instance = super().__new__(cls)
          return cls._instance

      def __init__(self):
          if self._initialized: return

-         logging.info("🏭 Initializing Advanced Pool Manager for LTX...")

-         self.ltx_workers: List[LTXMainWorker] = []
-         self.vae_workers: List[VAEWorker] = []
-         self.ltx_job_queue = queue.Queue()
-         self.vae_job_queue = queue.Queue()
          self.pool_lock = threading.Lock()

-         # Load the models on the CPU before creating the workers
-         self.main_pipeline, self.main_vae = self._load_components_once()
-
          self._initialize_workers()

-         self.ltx_dispatcher = threading.Thread(target=self._dispatch_jobs, args=(self.ltx_job_queue, self.ltx_workers), daemon=True)
-         self.vae_dispatcher = threading.Thread(target=self._dispatch_jobs, args=(self.vae_job_queue, self.vae_workers), daemon=True)
          self.health_monitor = threading.Thread(target=self._health_check_loop, daemon=True)
-
-         self.ltx_dispatcher.start()
-         self.vae_dispatcher.start()
          self.health_monitor.start()

          self._initialized = True
-         logging.info("✅ Advanced Pool Manager is running with all threads started.")
-
-     def _load_components_once(self) -> Tuple[LTXVideoPipeline, CausalVideoAutoencoder]:
-         """Orchestrates building ALL of the components on the CPU a single time."""
-         logging.info("Manager loading all components onto CPU...")
-         config_path = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled-fp8.yaml"
-         with open(config_path, "r") as file:
-             config = yaml.safe_load(file)
-
-         ckpt_path = hf_hub_download(repo_id="Lightricks/LTX-Video", filename=config["checkpoint_path"], cache_dir=os.environ.get("HF_HOME"))
-         pipeline, vae = build_components_on_cpu(ckpt_path, config)
-         logging.info("✅ All components loaded to CPU successfully.")
-         return pipeline, vae

      def _initialize_workers(self):
-         """Creates and starts the workers, injecting the already-loaded models."""
-         ltx_device = gpu_manager.get_ltx_device()
-         vae_device = gpu_manager.get_ltx_vae_device()
-
          with self.pool_lock:
-             ltx_worker = LTXMainWorker(worker_id=0, device=ltx_device, pipeline=self.main_pipeline)
-             self.ltx_workers.append(ltx_worker)
-             ltx_worker.start()
-
-             vae_worker = VAEWorker(worker_id=0, device=vae_device, vae=self.main_vae)
-             self.vae_workers.append(vae_worker)
-             vae_worker.start()

-     def _get_available_worker(self, worker_pool: List[BaseWorker]) -> Optional[BaseWorker]:
          with self.pool_lock:
-             for worker in worker_pool:
-                 healthy, busy = worker.get_status()
-                 if healthy and not busy: return worker
          return None

-     def _dispatch_jobs(self, job_queue: queue.Queue, worker_pool: List[BaseWorker]):
          while True:
-             job_func, args, kwargs, future = job_queue.get()
              worker = None
              while worker is None:
-                 worker = self._get_available_worker(worker_pool)
-                 if worker is None: time.sleep(0.1)
              try:
                  result = worker.execute(job_func, args, kwargs)
                  future.put(result)
@@ -204,37 +170,19 @@ class LTXAducManager:
          while True:
              time.sleep(30)
              with self.pool_lock:
-                 for i, worker in enumerate(self.ltx_workers):
-                     if not worker.is_alive() or not worker.is_healthy:
-                         logging.warning(f"LTX Worker {worker.worker_id} on {worker.device} is UNHEALTHY. Restarting...")
-                         new_worker = LTXMainWorker(worker.worker_id, worker.device, self.main_pipeline)
-                         self.ltx_workers[i] = new_worker
-                         new_worker.start()
-
-                 for i, worker in enumerate(self.vae_workers):
                      if not worker.is_alive() or not worker.is_healthy:
-                         logging.warning(f"VAE Worker {worker.worker_id} on {worker.device} is UNHEALTHY. Restarting...")
-                         new_worker = VAEWorker(worker.worker_id, worker.device, self.main_vae)
-                         self.vae_workers[i] = new_worker
                          new_worker.start()

-     def submit_job(self, job_type: str, job_func: Callable, *args, **kwargs) -> Any:
-         if job_type not in ['ltx', 'vae']:
-             raise ValueError("Invalid job_type. Must be 'ltx' or 'vae'.")
-
-         job_queue = self.ltx_job_queue if job_type == 'ltx' else self.vae_job_queue
          future = queue.Queue(1)
-         job_queue.put((job_func, args, kwargs, future))
          result = future.get()
-
-         if isinstance(result, Exception):
-             raise result
-
          return result

  # --- GLOBAL INSTANTIATION ---
- #try:
- ltx_aduc_manager = LTXAducManager()
- #except Exception:
-     # logging.critical("CRITICAL ERROR: Failed to initialize the LTXAducManager pool.", exc_info=True)
-     # ltx_aduc_manager = None
 
  # FILE: api/ltx/ltx_aduc_manager.py
+ # DESCRIPTION: A simplified, robust pool manager for a unified LTX worker.
+ # This worker handles all tasks, including Transformer generation and VAE operations,
+ # while still respecting the GPU separation defined by the GPUManager.

  import logging
  import torch

  import queue
  import time
  import yaml
+ import os
  from huggingface_hub import hf_hub_download
  from typing import List, Optional, Callable, Any, Tuple, Dict
+
  # --- Import the GPU manager and the low-level builder ---
  from managers.gpu_manager import gpu_manager
+ from api.ltx.ltx_utils import build_complete_pipeline_on_cpu, create_transformer

  # --- Add the LTX-Video path so types can be imported ---
  LTX_VIDEO_REPO_DIR = Path("/data/LTX-Video")

  add_deps_to_path()

  from ltx_video.pipelines.pipeline_ltx_video import LTXVideoPipeline

  # ==============================================================================
+ # --- BUILD ORCHESTRATION FUNCTION (internal to the manager) ---
  # ==============================================================================

+ def get_complete_pipeline() -> LTXVideoPipeline:
+     """
+     Orchestrates building the COMPLETE LTX pipeline, including the VAE, on the CPU.
+     """
+     config_path = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled-fp8.yaml"
+     with open(config_path, "r") as file:
+         config = yaml.safe_load(file)
+
+     ckpt_path = hf_hub_download(
+         repo_id="Lightricks/LTX-Video",
+         filename=config["checkpoint_path"],
+         cache_dir=os.environ.get("HF_HOME")
+     )
+     return build_complete_pipeline_on_cpu(ckpt_path, config)
+
+ # ==============================================================================
+ # --- UNIFIED WORKER CLASS ---
+ # ==============================================================================
+
+ class LTXWorker(threading.Thread):
+     """
+     A unified worker that manages a complete LTX pipeline instance.
+     It loads the model and distributes its components (Transformer/VAE) to the correct GPUs.
+     """
+     def __init__(self, worker_id: int):
          super().__init__()
          self.worker_id = worker_id
+         self.pipeline: Optional[LTXVideoPipeline] = None
          self.is_healthy = False
          self.is_busy = False
          self.daemon = True
+         self.autocast_dtype: torch.dtype = torch.float32

      def run(self):
+         """Initializes the worker: loads the pipeline and moves it onto the GPUs."""
+         try:
+             self.pipeline = get_complete_pipeline()
+             self._set_precision_policy()
+
+             main_device = gpu_manager.get_ltx_device()
+             vae_device = gpu_manager.get_ltx_vae_device()
+
+             logging.info(f"[LTXWorker-{self.worker_id}] Moving components -> Main: {main_device}, VAE: {vae_device}")
+             self.pipeline.to(main_device)  # Move everything to the main GPU first
+             self.pipeline.vae.to(vae_device)  # Then move the VAE specifically to its dedicated GPU
+
              self.is_healthy = True
+             logging.info(f"✅ LTXWorker {self.worker_id} is healthy. Main on {main_device}, VAE on {vae_device}.")
+         except Exception:
              self.is_healthy = False
+             logging.error(f"❌ LTXWorker {self.worker_id} FAILED to initialize!", exc_info=True)

      def _set_precision_policy(self):
+         """Sets the precision policy for autocast operations."""
+         try:
              config_path = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled-fp8.yaml"
+             with open(config_path, "r") as file: config = yaml.safe_load(file)
              precision = str(config.get("precision", "bfloat16")).lower()
              if precision in ["float8_e4m3fn", "bfloat16"]: self.autocast_dtype = torch.bfloat16
              elif precision == "mixed_precision": self.autocast_dtype = torch.float16
+         except Exception:
+             logging.warning(f"[LTXWorker-{self.worker_id}] Could not set precision policy, defaulting to float32.", exc_info=True)

      def execute(self, job_func: Callable, args: tuple, kwargs: dict) -> Any:
          self.is_busy = True
+         try:
+             # The job receives the complete pipeline and the dtype for autocast
              result = job_func(self.pipeline, self.autocast_dtype, *args, **kwargs)
              return result
+         except Exception:
+             self.is_healthy = False
+             raise
+         finally:
+             self.is_busy = False

  # ==============================================================================
+ # --- THE POOL MANAGER (SINGLETON) ---
  # ==============================================================================
  class LTXAducManager:
      _instance = None
      _initialized = False

      def __new__(cls, *args, **kwargs):
+         if cls._instance is None: cls._instance = super().__new__(cls)
          return cls._instance

      def __init__(self):
          if self._initialized: return

+         logging.info("🏭 Initializing Simplified Pool Manager for LTX...")

+         self.workers: List[LTXWorker] = []
+         self.job_queue = queue.Queue()
          self.pool_lock = threading.Lock()

          self._initialize_workers()

+         self.dispatcher = threading.Thread(target=self._dispatch_jobs, daemon=True)
          self.health_monitor = threading.Thread(target=self._health_check_loop, daemon=True)
+         self.dispatcher.start()
          self.health_monitor.start()

          self._initialized = True
+         logging.info("✅ Simplified Pool Manager is running.")

      def _initialize_workers(self):
          with self.pool_lock:
+             # For now we create a single unified worker.
+             # In the future, this loop can create multiple workers if more GPUs are available.
+             worker = LTXWorker(worker_id=0)
+             self.workers.append(worker)
+             worker.start()

+     def _get_available_worker(self) -> Optional[LTXWorker]:
          with self.pool_lock:
+             for worker in self.workers:
+                 if worker.is_healthy and not worker.is_busy:
+                     return worker
          return None

+     def _dispatch_jobs(self):
          while True:
+             job_func, args, kwargs, future = self.job_queue.get()
              worker = None
              while worker is None:
+                 worker = self._get_available_worker()
+                 if worker is None: time.sleep(0.1)
              try:
                  result = worker.execute(job_func, args, kwargs)
                  future.put(result)

          while True:
              time.sleep(30)
              with self.pool_lock:
+                 for i, worker in enumerate(self.workers):
                      if not worker.is_alive() or not worker.is_healthy:
+                         logging.warning(f"LTX Worker {worker.worker_id} is UNHEALTHY. Restarting...")
+                         new_worker = LTXWorker(worker_id=worker.worker_id)
+                         self.workers[i] = new_worker
                          new_worker.start()

+     def submit_job(self, job_func: Callable, *args, **kwargs) -> Any:
          future = queue.Queue(1)
+         self.job_queue.put((job_func, args, kwargs, future))
          result = future.get()
+         if isinstance(result, Exception): raise result
          return result

  # --- GLOBAL INSTANTIATION ---
+ ltx_aduc_manager = LTXAducManager()
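
Caller-side sketch (not part of the commit): with this change, submit_job takes only a job function plus its arguments, and the dispatcher invokes that function with the complete pipeline and the worker's autocast dtype, exactly as LTXWorker.execute does. The job body below (generate_clip, its prompt handling, and the pipeline call arguments) is illustrative only; the real LTXVideoPipeline call signature may differ.

  # Hypothetical usage sketch -- the job body and pipeline arguments are illustrative,
  # not taken from this repository.
  import torch
  from api.ltx.ltx_aduc_manager import ltx_aduc_manager

  def generate_clip(pipeline, autocast_dtype, prompt: str, num_frames: int):
      """Runs inside the worker thread; receives the pipeline and dtype from execute()."""
      with torch.autocast(device_type="cuda", dtype=autocast_dtype):
          # Placeholder call: actual LTXVideoPipeline arguments may differ.
          return pipeline(prompt=prompt, num_frames=num_frames)

  # Blocks until a healthy worker picks the job up and the result (or the re-raised exception) comes back.
  video = ltx_aduc_manager.submit_job(generate_clip, "a drone shot over a coastline", num_frames=65)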