# Provenance: HuggingFace Space "Test", file managers/gpu_manager.py
# (renamed from manager/gpu_manager.py in verified commit dfbfabf; 5.43 kB)
# FILE: managers/gpu_manager.py
# DESCRIPTION: A hardware-aware, service-agnostic GPU allocator for the ADUC-SDR suite.
# This module inspects available GPUs and partitions them according to a predefined
# allocation strategy, assigning devices to the LTX, SeedVR, and VINCIE services.
import logging
import math
import os
from collections import Counter
from typing import List

import torch
class GPUManager:
    """
    Manages and allocates available GPUs among different services.

    It operates agnostically, providing device information without knowing
    the specifics of the services that will use them.

    Attributes:
        total_gpus: Number of CUDA devices reported by torch at init time.
        ltx_main_gpus: GPU indices for the LTX transformer pipeline.
        ltx_vae_gpu: GPU index for the LTX VAE (one-element list, or empty
            when no GPU is available).
        seedvr_gpus: GPU indices for the SeedVR service.
        vincie_gpus: GPU indices for the VINCIE service.
    """

    def __init__(self) -> None:
        """Initializes the manager, detects GPUs, and runs the allocation logic."""
        self.total_gpus: int = torch.cuda.device_count()
        self.ltx_main_gpus: List[int] = []
        self.ltx_vae_gpu: List[int] = []
        self.seedvr_gpus: List[int] = []
        self.vincie_gpus: List[int] = []
        self._allocate_gpus()

    def _allocate_gpus(self) -> None:
        """
        Implements the GPU allocation strategy based on the total number of
        detected GPUs.

        Strategy:
            0 GPUs  -> everything stays empty (CPU mode).
            1 GPU   -> all services share GPU 0 (memory swapping required).
            2 GPUs  -> LTX main on 0, VAE dedicated on 1; SeedVR/VINCIE share 0.
            3+ GPUs -> LTX main on 0, VAE on 1; the remainder is split between
                       VINCIE (gets the larger half) and SeedVR (falls back to
                       sharing GPU 0 if nothing is left).
        """
        logging.info("=" * 60)
        logging.info("🤖 Initializing GPU Manager (LTX, SeedVR, VINCIE)")
        logging.info(f"   > Total GPUs detected: {self.total_gpus}")

        all_indices = list(range(self.total_gpus))

        if self.total_gpus == 0:
            logging.warning("   > No GPUs detected. All services will operate in CPU mode.")
        elif self.total_gpus == 1:
            logging.warning("   > 1 GPU detected. All services will share GPU 0. Memory swapping will be active.")
            self.ltx_main_gpus = [0]
            self.ltx_vae_gpu = [0]  # Shares with the main LTX pipeline
            self.seedvr_gpus = [0]
            self.vincie_gpus = [0]
        elif self.total_gpus == 2:
            logging.info("   > 2 GPUs detected. LTX will use a dedicated VAE device.")
            self.ltx_main_gpus = [0]
            self.ltx_vae_gpu = [1]  # VAE gets the second GPU
            self.seedvr_gpus = [0]  # Shares with main LTX
            self.vincie_gpus = [0]  # Shares with main LTX
        else:  # 3 or more GPUs
            logging.info(f"   > {self.total_gpus} GPUs detected. Distributing allocation.")
            # LTX always gets the first two GPUs for optimal performance.
            self.ltx_main_gpus = [0]
            self.ltx_vae_gpu = [1]
            remaining_gpus = all_indices[2:]

            # The rest are divided between SeedVR and VINCIE.
            # VINCIE gets priority (the ceil half) as it scales well with
            # more GPUs; max(1, ...) is defensive — remaining_gpus is
            # non-empty in this branch, so ceil() is already >= 1.
            vincie_count = max(1, math.ceil(len(remaining_gpus) / 2))
            seedvr_count = len(remaining_gpus) - vincie_count
            self.vincie_gpus = remaining_gpus[:vincie_count]

            if seedvr_count > 0:
                self.seedvr_gpus = remaining_gpus[vincie_count:]
            else:
                # No GPUs left for SeedVR: it shares the main LTX GPU.
                self.seedvr_gpus = [0]

        # Plain string here: the original used an f-string with no placeholders.
        logging.info("   > Final Allocation:")
        logging.info(f"     - LTX (Transformer): GPUs {self.ltx_main_gpus}")
        logging.info(f"     - LTX (VAE): GPU {self.ltx_vae_gpu[0] if self.ltx_vae_gpu else 'N/A'}")
        logging.info(f"     - SeedVR: GPUs {self.seedvr_gpus}")
        logging.info(f"     - VINCIE: GPUs {self.vincie_gpus}")
        logging.info("=" * 60)

    def get_ltx_device(self) -> torch.device:
        """Returns the primary device for the LTX Transformer pipeline (CPU if none)."""
        if not self.ltx_main_gpus:
            return torch.device("cpu")
        return torch.device(f"cuda:{self.ltx_main_gpus[0]}")

    def get_ltx_vae_device(self) -> torch.device:
        """Returns the dedicated device for the LTX VAE (CPU if none)."""
        if not self.ltx_vae_gpu:
            return torch.device("cpu")
        return torch.device(f"cuda:{self.ltx_vae_gpu[0]}")

    def get_seedvr_devices(self) -> List[int]:
        """Returns the list of GPU indices for the SeedVR service."""
        return self.seedvr_gpus

    def get_vincie_devices(self) -> List[int]:
        """Returns the list of GPU indices for the VINCIE service."""
        return self.vincie_gpus

    def requires_memory_swap(self) -> bool:
        """
        Determines if memory swapping is necessary because multiple services
        are sharing the same primary GPU.

        The dedicated VAE GPU is intentionally excluded from this check.

        Returns:
            True if any GPU hosts more than one primary service allocation.
        """
        # Count primary (memory-intensive) allocations per GPU index.
        gpu_usage_count = Counter(self.ltx_main_gpus + self.seedvr_gpus + self.vincie_gpus)

        # Swapping is required if any GPU is used by more than one service.
        for gpu_idx, count in gpu_usage_count.items():
            if count > 1:
                logging.warning(f"Memory swapping is ACTIVE because GPU {gpu_idx} is shared by multiple services.")
                return True

        logging.info("Memory swapping is INACTIVE. Each service has dedicated primary GPUs.")
        return False
# --- Singleton Instantiation ---
# This global instance is created once and imported by all other modules.
# NOTE: constructing it queries torch.cuda.device_count() and emits the
# allocation log banner as an import-time side effect.
gpu_manager = GPUManager()