# FILE: managers/gpu_manager.py
# DESCRIPTION: A hardware-aware, service-agnostic GPU allocator for the ADUC-SDR suite.
# This module inspects available GPUs and partitions them among services
# according to a predefined allocation strategy.

import logging
import math
from typing import List

import torch
class GPUManager:
    """
    Manages and allocates available GPUs among different services.
    It operates agnostically, providing device information without knowing
    the specifics of the services that will use them.
    """

    def __init__(self):
        """Initializes the manager, detects GPUs, and runs the allocation logic."""
        self.total_gpus = torch.cuda.device_count()
        self.ltx_main_gpus: List[int] = []  # GPUs for the LTX Transformer pipeline
        self.ltx_vae_gpu: List[int] = []    # Single-element list holding the dedicated LTX VAE GPU
        self.seedvr_gpus: List[int] = []    # GPUs for the SeedVR service
        self.vincie_gpus: List[int] = []    # GPUs for the VINCIE service
        self._allocate_gpus()
    def _allocate_gpus(self):
        """
        Implements the GPU allocation strategy based on the total number of
        detected GPUs.
        """
        logging.info("=" * 60)
        logging.info("🤖 Initializing GPU Manager (LTX, SeedVR, VINCIE)")
        logging.info(f" > Total GPUs detected: {self.total_gpus}")

        all_indices = list(range(self.total_gpus))

        if self.total_gpus == 0:
            logging.warning(" > No GPUs detected. All services will operate in CPU mode.")
        elif self.total_gpus == 1:
            logging.warning(" > 1 GPU detected. All services will share GPU 0. Memory swapping will be active.")
            self.ltx_main_gpus = [0]
            self.ltx_vae_gpu = [0]  # Shares with the main LTX pipeline
            self.seedvr_gpus = [0]
            self.vincie_gpus = [0]
        elif self.total_gpus == 2:
            logging.info(" > 2 GPUs detected. LTX will use a dedicated VAE device.")
            self.ltx_main_gpus = [0]
            self.ltx_vae_gpu = [1]  # The VAE gets the second GPU
            self.seedvr_gpus = [0]  # Shares with main LTX
            self.vincie_gpus = [0]  # Shares with main LTX
        else:  # 3 or more GPUs
            logging.info(f" > {self.total_gpus} GPUs detected. Distributing allocation.")
            # LTX always gets the first two GPUs for optimal performance.
            self.ltx_main_gpus = [0]
            self.ltx_vae_gpu = [1]
            remaining_gpus = all_indices[2:]

            # The rest are divided between SeedVR and VINCIE. VINCIE gets
            # priority (the larger half when the count is odd), as it scales
            # well with more GPUs.
            vincie_count = max(1, math.ceil(len(remaining_gpus) / 2))
            seedvr_count = len(remaining_gpus) - vincie_count
            self.vincie_gpus = remaining_gpus[:vincie_count]
            if seedvr_count > 0:
                self.seedvr_gpus = remaining_gpus[vincie_count:]
            else:
                # No GPU left over for SeedVR: it shares the main LTX GPU.
                self.seedvr_gpus = [0]

        logging.info(" > Final Allocation:")
        logging.info(f"   - LTX (Transformer): GPUs {self.ltx_main_gpus}")
        logging.info(f"   - LTX (VAE): GPU {self.ltx_vae_gpu[0] if self.ltx_vae_gpu else 'N/A'}")
        logging.info(f"   - SeedVR: GPUs {self.seedvr_gpus}")
        logging.info(f"   - VINCIE: GPUs {self.vincie_gpus}")
        logging.info("=" * 60)
    def get_ltx_device(self) -> torch.device:
        """Returns the primary device for the LTX Transformer pipeline."""
        if not self.ltx_main_gpus:
            return torch.device("cpu")
        return torch.device(f"cuda:{self.ltx_main_gpus[0]}")

    def get_ltx_vae_device(self) -> torch.device:
        """Returns the dedicated device for the LTX VAE."""
        if not self.ltx_vae_gpu:
            return torch.device("cpu")
        return torch.device(f"cuda:{self.ltx_vae_gpu[0]}")

    def get_seedvr_devices(self) -> List[int]:
        """Returns the list of GPU indices for the SeedVR service."""
        return self.seedvr_gpus

    def get_vincie_devices(self) -> List[int]:
        """Returns the list of GPU indices for the VINCIE service."""
        return self.vincie_gpus
    def requires_memory_swap(self) -> bool:
        """
        Determines whether memory swapping is necessary, i.e. whether multiple
        services share the same primary GPU. The dedicated VAE GPU is not
        considered in this check. For example, with 2 GPUs the main
        allocations are [0], [0] and [0], so GPU 0 is shared and swapping
        is required.
        """
        # Collect the GPUs used by the main, memory-intensive part of each service.
        all_main_allocations = self.ltx_main_gpus + self.seedvr_gpus + self.vincie_gpus

        # Count how many services are allocated to each unique GPU.
        gpu_usage_count = {}
        for gpu_idx in all_main_allocations:
            gpu_usage_count[gpu_idx] = gpu_usage_count.get(gpu_idx, 0) + 1

        # Swapping is required if any GPU is used by more than one service.
        for gpu_idx, count in gpu_usage_count.items():
            if count > 1:
                logging.warning(f"Memory swapping is ACTIVE because GPU {gpu_idx} is shared by multiple services.")
                return True

        logging.info("Memory swapping is INACTIVE. Each service has dedicated primary GPUs.")
        return False
# --- Singleton Instantiation ---
# This global instance is created once at import time and imported by all
# other modules.
gpu_manager = GPUManager()
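
# --- Usage sketch (illustrative) ---
# A minimal example of how a consumer might use the singleton; the print
# statements and the swap-handling comment are illustrative only, while the
# gpu_manager methods are this module's actual API. Note that the singleton
# (and its logging) is created at import time, so in real use configure
# logging before importing this module.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    print(f"LTX transformer device: {gpu_manager.get_ltx_device()}")
    print(f"LTX VAE device:         {gpu_manager.get_ltx_vae_device()}")
    print(f"SeedVR GPU indices:     {gpu_manager.get_seedvr_devices()}")
    print(f"VINCIE GPU indices:     {gpu_manager.get_vincie_devices()}")

    if gpu_manager.requires_memory_swap():
        # A sharing service would offload its weights between runs,
        # e.g. model.to("cpu"), before another service claims the GPU.
        print("Shared primary GPU detected: services must swap weights in and out.")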