aducsdr committed on
Commit
c99f4f7
·
verified ·
1 Parent(s): 2bfde6e

Update aduc_framework/managers/mmaudio_manager.py

Browse files
aduc_framework/managers/mmaudio_manager.py CHANGED
@@ -1,23 +1,12 @@
1
  # managers/mmaudio_manager.py
2
- # AducSdr: Uma implementação aberta e funcional da arquitetura ADUC-SDR
3
- # Copyright (C) 4 de Agosto de 2025 Carlos Rodrigues dos Santos
4
  #
5
- # Contato:
6
- # Carlos Rodrigues dos Santos
7
- # carlex22@gmail.com
8
- # Rua Eduardo Carlos Pereira, 4125, B1 Ap32, Curitiba, PR, Brazil, CEP 8102025
9
  #
10
- # Repositórios e Projetos Relacionados:
11
- # GitHub: https://github.com/carlex22/Aduc-sdr
12
  #
13
- # PENDING PATENT NOTICE: Please see NOTICE.md.
14
- #
15
- # Version: 2.3.0
16
- #
17
- # This file defines the MMAudioManager for the ADUC-SDR framework. It is responsible
18
- # for generating audio synchronized with video clips. This version has been refactored
19
- # to be self-contained by automatically cloning the MMAudio dependency from its
20
- # official repository, making the framework more portable and easier to set up.
21
 
22
  import torch
23
  import logging
@@ -26,183 +15,212 @@ import os
26
  import time
27
  import yaml
28
  import gc
 
29
  from pathlib import Path
30
  import gradio as gr
31
  import sys
32
 
 
 
 
33
  logger = logging.getLogger(__name__)
34
 
35
- # --- Dependency Management ---
36
  DEPS_DIR = Path("./deps")
37
  MMAUDIO_REPO_DIR = DEPS_DIR / "MMAudio"
38
  MMAUDIO_REPO_URL = "https://github.com/hkchengrex/MMAudio.git"
39
 
40
def setup_mmaudio_dependencies():
    """Make the MMAudio sources importable, cloning them first if necessary.

    If ``MMAUDIO_REPO_DIR`` does not exist, a shallow ``git clone`` of
    ``MMAUDIO_REPO_URL`` is performed into it. Afterwards the resolved
    repository directory is placed at the front of ``sys.path`` unless it is
    already listed there.

    Raises:
        RuntimeError: if ``git clone`` exits with a non-zero status.
    """
    if MMAUDIO_REPO_DIR.exists():
        logger.info("Found local MMAudio repository.")
    else:
        logger.info(f"MMAudio repository not found at '{MMAUDIO_REPO_DIR}'. Cloning from GitHub...")
        clone_cmd = ["git", "clone", "--depth", "1", MMAUDIO_REPO_URL, str(MMAUDIO_REPO_DIR)]
        try:
            DEPS_DIR.mkdir(exist_ok=True)
            # Output is captured so git's stderr can be reported on failure.
            subprocess.run(clone_cmd, check=True, capture_output=True, text=True)
            logger.info("MMAudio repository cloned successfully.")
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to clone MMAudio repository. Git stderr: {e.stderr}")
            raise RuntimeError("Could not clone the required MMAudio dependency from GitHub.")

    repo_root = MMAUDIO_REPO_DIR.resolve()
    if str(repo_root) not in sys.path:
        sys.path.insert(0, str(repo_root))
        logger.info(f"Added '{repo_root}' to sys.path.")
63
-
64
- setup_mmaudio_dependencies()
65
-
66
- from mmaudio.eval_utils import ModelConfig, all_model_cfg, generate as mmaudio_generate, load_video, make_video
67
- from mmaudio.model.flow_matching import FlowMatching
68
- from mmaudio.model.networks import MMAudio, get_my_mmaudio
69
- from mmaudio.model.utils.features_utils import FeaturesUtils
70
- from mmaudio.model.sequence_config import SequenceConfig
71
-
72
-
73
class MMAudioManager:
    """Holds the MMAudio network and feature utilities used to add audio to videos.

    The models are loaded onto the CPU when the manager is constructed and are
    moved to the GPU only for the duration of a generation call.
    """

    def __init__(self, workspace_dir):
        """Select device/dtype, pick the 'large_44k_v2' config and load the models.

        Args:
            workspace_dir: directory used for generated files when the caller
                does not supply an explicit output path.
        """
        cuda_available = torch.cuda.is_available()
        self.device = "cuda" if cuda_available else "cpu"
        self.cpu_device = torch.device("cpu")
        self.dtype = torch.bfloat16 if cuda_available else torch.float32
        self.workspace_dir = workspace_dir

        self.all_model_cfg = all_model_cfg
        self.model_config: 'ModelConfig' = self.all_model_cfg['large_44k_v2']
        self.net: 'MMAudio' = None
        self.feature_utils: 'FeaturesUtils' = None
        self.seq_cfg: 'SequenceConfig' = None

        self._load_models_to_cpu()

    def _adjust_paths_for_repo(self):
        """Prefix every checkpoint path in every model config with the repo directory."""
        always_present = ("model_path", "vae_path", "synchformer_ckpt")
        for cfg_name in self.all_model_cfg:
            entry = self.all_model_cfg[cfg_name]
            for attr in always_present:
                setattr(entry, attr, MMAUDIO_REPO_DIR / getattr(entry, attr))
            # The 16k vocoder path is optional and left untouched when unset.
            if entry.bigvgan_16k_path is not None:
                entry.bigvgan_16k_path = MMAUDIO_REPO_DIR / entry.bigvgan_16k_path

    def _load_models_to_cpu(self):
        """Download (if needed) and load the network and feature utilities on the CPU.

        Any failure is logged and leaves ``self.net`` set to ``None`` instead of
        propagating, so construction of the manager itself never raises here.
        """
        try:
            self._adjust_paths_for_repo()
            logger.info("Verifying and downloading MMAudio models, if necessary...")
            cfg = self.model_config
            cfg.download_if_needed()

            self.seq_cfg = cfg.seq_cfg

            logger.info(f"Loading MMAudio model: {cfg.model_name} to CPU...")
            self.net = get_my_mmaudio(cfg.model_name).eval()
            state = torch.load(cfg.model_path, map_location=self.cpu_device, weights_only=True)
            self.net.load_weights(state)

            logger.info("Loading MMAudio feature utils to CPU...")
            utils = FeaturesUtils(
                tod_vae_ckpt=cfg.vae_path,
                synchformer_ckpt=cfg.synchformer_ckpt,
                enable_conditions=True,
                mode=cfg.mode,
                bigvgan_vocoder_ckpt=cfg.bigvgan_16k_path,
                need_vae_encoder=False,
            )
            self.feature_utils = utils.eval()
            self.net.to(self.cpu_device)
            self.feature_utils.to(self.cpu_device)
            logger.info("MMAudioManager ready on CPU.")
        except Exception as e:
            logger.error(f"Failed to load audio models: {e}", exc_info=True)
            self.net = None

    def to_gpu(self):
        """Move the network and feature utilities to the GPU; no-op on CPU-only hosts."""
        if self.device != 'cpu':
            logger.info(f"Moving MMAudioManager to GPU ({self.device})...")
            self.net.to(self.device, self.dtype)
            self.feature_utils.to(self.device, self.dtype)

    def to_cpu(self):
        """Move the models back to the CPU and release cached VRAM; no-op on CPU-only hosts."""
        if self.device != 'cpu':
            logger.info("Unloading MMAudioManager from GPU...")
            self.net.to(self.cpu_device)
            self.feature_utils.to(self.cpu_device)
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    def generate_audio_for_video(self, video_path: str, prompt: str, duration_seconds: float, output_path_override: str = None) -> str:
        """Generate audio for a video, using a fixed negative prompt that lists speech terms.

        Args:
            video_path: path of the input video.
            prompt: text prompt passed to the generator.
            duration_seconds: duration handed to ``load_video``; values below
                one second skip generation.
            output_path_override: explicit output file; when falsy the result
                is written to ``<workspace_dir>/<video stem>_with_audio.mp4``.

        Returns:
            The output video path, or ``video_path`` unchanged when the
            fragment is shorter than one second.

        Raises:
            gr.Error: if the model failed to load at construction time.
        """
        if self.net is None:
            raise gr.Error("MMAudio model is not loaded. Cannot generate audio.")

        logger.info("--- Generating Audio for Video Fragment ---")
        logger.info(f"--- Video: {os.path.basename(video_path)}")
        logger.info(f"--- Duration: {duration_seconds:.2f}s")

        negative_prompt = "human voice, speech, talking, singing, narration"
        logger.info(f"--- Prompt: '{prompt}' | Negative Prompt: '{negative_prompt}'")

        if duration_seconds < 1:
            logger.warning("Fragment too short (<1s). Returning original video.")
            return video_path

        if self.device == 'cpu':
            logger.warning("Generating audio on CPU. This may be very slow.")

        try:
            self.to_gpu()
            with torch.no_grad():
                # Seeded from wall-clock time, so repeated runs differ.
                generator = torch.Generator(device=self.device).manual_seed(int(time.time()))
                flow = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=25)

                clip = load_video(Path(video_path), duration_seconds)
                self.seq_cfg.duration = clip.duration_sec
                self.net.update_seq_lengths(
                    self.seq_cfg.latent_seq_len,
                    self.seq_cfg.clip_seq_len,
                    self.seq_cfg.sync_seq_len,
                )

                generated = mmaudio_generate(
                    clip_video=clip.clip_frames.unsqueeze(0),
                    sync_video=clip.sync_frames.unsqueeze(0),
                    text=[prompt],
                    negative_text=[negative_prompt],
                    feature_utils=self.feature_utils,
                    net=self.net,
                    fm=flow,
                    rng=generator,
                    cfg_strength=4.5,
                )
                waveform = generated.float().cpu()[0]

                output_video_path = output_path_override or os.path.join(
                    self.workspace_dir, f"{Path(video_path).stem}_with_audio.mp4"
                )

                make_video(clip, Path(output_video_path), waveform, sampling_rate=self.seq_cfg.sampling_rate)
                logger.info(f"--- Fragment with audio saved to: {os.path.basename(output_video_path)}")
                return output_video_path
        finally:
            # Always hand the VRAM back, whether generation succeeded or not.
            self.to_cpu()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
- # --- Singleton Instantiation ---
201
# Build the module-level manager from config.yaml; on any failure the
# singleton is set to None and the error is logged instead of raised.
try:
    with open("config.yaml", 'r') as f:
        config = yaml.safe_load(f)
    WORKSPACE_DIR = config['application']['workspace_dir']
    mmaudio_manager_singleton = MMAudioManager(workspace_dir=WORKSPACE_DIR)
except Exception as e:
    logger.error(f"Could not initialize MMAudioManager: {e}", exc_info=True)
    mmaudio_manager_singleton = None
 
1
  # managers/mmaudio_manager.py
 
 
2
  #
3
+ # Copyright (C) 2025 Carlos Rodrigues dos Santos
 
 
 
4
  #
5
+ # Version: 3.0.0 (GPU Pool Manager)
 
6
  #
7
+ # Esta versão refatora o MMAudioManager para um modelo de Pool com Workers,
8
+ # permitindo o uso de múltiplas GPUs dedicadas para a geração de áudio
9
+ # com um sistema de rodízio para gerenciamento eficiente de VRAM.
 
 
 
 
 
10
 
11
  import torch
12
  import logging
 
15
  import time
16
  import yaml
17
  import gc
18
+ import threading
19
  from pathlib import Path
20
  import gradio as gr
21
  import sys
22
 
23
+ # Imports relativos para o hardware_manager
24
+ from ..tools.hardware_manager import hardware_manager
25
+
26
  logger = logging.getLogger(__name__)
27
 
28
+ # --- Gerenciamento de Dependências ---
29
  DEPS_DIR = Path("./deps")
30
  MMAUDIO_REPO_DIR = DEPS_DIR / "MMAudio"
31
  MMAUDIO_REPO_URL = "https://github.com/hkchengrex/MMAudio.git"
32
 
33
+ # Lazy-loaded imports
34
+ ModelConfig, all_model_cfg, mmaudio_generate, load_video, make_video = None, None, None, None, None
35
+ MMAudio, get_my_mmaudio = None, None
36
+ FeaturesUtils = None
37
+ SequenceConfig = None
38
+ FlowMatching = None
39
+
40
class MMAudioWorker:
    """One MMAudio pipeline instance bound to a single device.

    A worker starts without models. ``initialize_models`` loads the network and
    feature utilities onto the worker's device, and ``unload_models`` drops
    them again so the VRAM can be reused.
    """

    # Process-wide guard: the checkpoint paths stored in the shared
    # ``all_model_cfg`` entries must receive the repository prefix exactly once
    # per interpreter, however many workers are constructed.
    _paths_adjusted = False

    def __init__(self, device_id: str):
        """Prepare the MMAudio dependency and bind the worker to ``device_id``.

        Args:
            device_id: device string accepted by ``torch.device``.
        """
        self.device = torch.device(device_id)
        self.cpu_device = torch.device("cpu")
        self.dtype = torch.bfloat16 if 'cuda' in self.device.type else torch.float32

        self.net: 'MMAudio' = None
        self.feature_utils: 'FeaturesUtils' = None
        self.seq_cfg: 'SequenceConfig' = None
        self.model_config: 'ModelConfig' = None

        self._check_and_run_global_setup()
        self._lazy_load_mmaudio_modules()
        logger.info(f"MMAudio Worker inicializado para o dispositivo {self.device}.")

    def _lazy_load_mmaudio_modules(self):
        """Import the MMAudio modules into the module globals (first call only)."""
        global ModelConfig, all_model_cfg, mmaudio_generate, load_video, make_video, MMAudio, get_my_mmaudio, FeaturesUtils, SequenceConfig, FlowMatching
        if MMAudio is not None:
            return

        from mmaudio.eval_utils import ModelConfig, all_model_cfg, generate as mmaudio_generate, load_video, make_video
        from mmaudio.model.flow_matching import FlowMatching
        from mmaudio.model.networks import MMAudio, get_my_mmaudio
        from mmaudio.model.utils.features_utils import FeaturesUtils
        from mmaudio.model.sequence_config import SequenceConfig
        logger.info("Módulos do MMAudio foram carregados dinamicamente.")

    @staticmethod
    def _check_and_run_global_setup():
        """Ensure the MMAudio repository is usable from this process.

        The on-disk flag file only records that the model download already
        happened. Making the repository importable and pointing the checkpoint
        paths into it are per-process steps, so they run even when the flag
        exists; otherwise any process started after the first successful setup
        could neither import ``mmaudio`` from the clone nor locate the weights.

        Returns:
            Always ``True``.

        Raises:
            subprocess.CalledProcessError: if ``git clone`` fails.
        """
        setup_flag = DEPS_DIR / "mmaudio.setup.complete"
        first_run = not setup_flag.exists()

        if first_run:
            logger.info("--- Iniciando Setup Global do MMAudio (primeira execução) ---")

        if not MMAUDIO_REPO_DIR.exists():
            DEPS_DIR.mkdir(parents=True, exist_ok=True)
            subprocess.run(["git", "clone", "--depth", "1", MMAUDIO_REPO_URL, str(MMAUDIO_REPO_DIR)], check=True)

        repo_root = str(MMAUDIO_REPO_DIR.resolve())
        if repo_root not in sys.path:
            sys.path.insert(0, repo_root)

        # Must be imported only after the repository is on sys.path.
        from mmaudio.eval_utils import all_model_cfg as cfg

        if not MMAudioWorker._paths_adjusted:
            for cfg_key in cfg:
                model_cfg = cfg[cfg_key]
                model_cfg.model_path = MMAUDIO_REPO_DIR / model_cfg.model_path
                model_cfg.vae_path = MMAUDIO_REPO_DIR / model_cfg.vae_path
                if model_cfg.bigvgan_16k_path:
                    model_cfg.bigvgan_16k_path = MMAUDIO_REPO_DIR / model_cfg.bigvgan_16k_path
                model_cfg.synchformer_ckpt = MMAUDIO_REPO_DIR / model_cfg.synchformer_ckpt
            MMAudioWorker._paths_adjusted = True

        if first_run:
            for cfg_key in cfg:
                cfg[cfg_key].download_if_needed()
            setup_flag.touch()
            logger.info("--- Setup Global do MMAudio Concluído ---")
        return True

    def initialize_models(self):
        """Load the 'large_44k_v2' models on the CPU, then move them to this worker's device.

        Does nothing when the models are already loaded.
        """
        if self.net is not None:
            return

        self.model_config = all_model_cfg['large_44k_v2']
        self.seq_cfg = self.model_config.seq_cfg

        logger.info(f"Worker {self.device}: Carregando modelo MMAudio para a CPU...")
        self.net = get_my_mmaudio(self.model_config.model_name).eval()
        self.net.load_weights(torch.load(self.model_config.model_path, map_location=self.cpu_device, weights_only=True))

        self.feature_utils = FeaturesUtils(
            tod_vae_ckpt=self.model_config.vae_path,
            synchformer_ckpt=self.model_config.synchformer_ckpt,
            enable_conditions=True,
            mode=self.model_config.mode,
            bigvgan_vocoder_ckpt=self.model_config.bigvgan_16k_path,
            need_vae_encoder=False,
        ).eval()

        self.net.to(self.device, self.dtype)
        self.feature_utils.to(self.device, self.dtype)
        logger.info(f"Worker {self.device}: Modelos MMAudio prontos na VRAM.")

    def unload_models(self):
        """Release this worker's models and free cached VRAM; no-op when nothing is loaded."""
        if self.net is None:
            return
        logger.info(f"Worker {self.device}: Descarregando modelos MMAudio da VRAM...")
        # Moving to the CPU first means the device memory is given up even if
        # some other reference to the modules is still alive somewhere.
        self.net.to(self.cpu_device)
        self.feature_utils.to(self.cpu_device)
        self.net, self.feature_utils, self.seq_cfg, self.model_config = None, None, None, None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def generate_audio_internal(self, video_path: str, prompt: str, duration_seconds: float, output_path: str) -> str:
        """Generate audio for ``video_path`` on this worker's device.

        Args:
            video_path: path of the input video.
            prompt: text prompt passed to the generator.
            duration_seconds: duration handed to ``load_video``.
            output_path: destination passed to ``make_video``.

        Returns:
            ``output_path``.

        Raises:
            RuntimeError: if ``initialize_models`` has not been called (or the
                models were unloaded) before generation.
        """
        if self.net is None or self.feature_utils is None:
            raise RuntimeError(
                f"Worker {self.device}: modelos MMAudio não carregados; "
                "chame initialize_models() antes de gerar áudio."
            )

        negative_prompt = "human voice, speech, talking, singing, narration"
        # Seeded from wall-clock time, so repeated runs differ.
        rng = torch.Generator(device=self.device).manual_seed(int(time.time()))
        fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=25)

        video_info = load_video(Path(video_path), duration_seconds)
        self.seq_cfg.duration = video_info.duration_sec
        self.net.update_seq_lengths(self.seq_cfg.latent_seq_len, self.seq_cfg.clip_seq_len, self.seq_cfg.sync_seq_len)

        with torch.no_grad():
            audios = mmaudio_generate(
                clip_video=video_info.clip_frames.unsqueeze(0).to(self.device, self.dtype),
                sync_video=video_info.sync_frames.unsqueeze(0).to(self.device, self.dtype),
                text=[prompt],
                negative_text=[negative_prompt],
                feature_utils=self.feature_utils,
                net=self.net,
                fm=fm,
                rng=rng,
                cfg_strength=4.5,
            )
            audio_waveform = audios.float().cpu()[0]

        make_video(video_info, Path(output_path), audio_waveform, sampling_rate=self.seq_cfg.sampling_rate)
        return output_path
155
 
156
class MMAudioPoolManager:
    """Round-robin pool of ``MMAudioWorker`` instances, one per dedicated GPU.

    Each request is served by the next worker in rotation. While that worker
    loads its models, the worker used by the previous request is unloaded on a
    background thread.
    """

    def __init__(self, device_ids: list[str], workspace_dir: str):
        """Create one worker per device.

        Args:
            device_ids: device strings, one per worker; must be non-empty and
                must not contain ``'cpu'``.
            workspace_dir: directory used for generated files when the caller
                does not supply an explicit output path.

        Raises:
            ValueError: if ``device_ids`` is empty or contains ``'cpu'``.
        """
        logger.info(f"MMAUDIO POOL MANAGER: Criando workers para os dispositivos: {device_ids}")
        self.workspace_dir = workspace_dir
        if not device_ids or 'cpu' in device_ids:
            raise ValueError("MMAudioPoolManager requer GPUs dedicadas.")
        self.workers = [MMAudioWorker(device_id) for device_id in device_ids]
        self.current_worker_index = 0
        self.lock = threading.Lock()
        self.last_cleanup_thread = None

    def _cleanup_worker_thread(self, worker: MMAudioWorker):
        """Thread target: unload ``worker``'s models."""
        logger.info(f"MMAUDIO CLEANUP THREAD: Iniciando limpeza de {worker.device} em background...")
        worker.unload_models()

    def generate_audio_for_video(self, video_path: str, prompt: str, duration_seconds: float, output_path_override: str = None) -> str:
        """Generate audio for a video on the next worker in the rotation.

        Args:
            video_path: path of the input video.
            prompt: text prompt passed to the worker.
            duration_seconds: clip duration; values below one second skip
                generation.
            output_path_override: explicit output file; when falsy the result
                is written to ``<workspace_dir>/<video stem>_with_audio.mp4``.

        Returns:
            The output video path, or ``video_path`` unchanged when the clip is
            shorter than one second.

        Raises:
            gr.Error: wrapping any exception raised while selecting the worker
                or generating the audio.
        """
        if duration_seconds < 1:
            logger.warning(f"Vídeo muito curto ({duration_seconds:.2f}s). Pulando geração de áudio.")
            return video_path

        try:
            with self.lock:
                # Never let two cleanups overlap: wait for the previous one.
                if self.last_cleanup_thread and self.last_cleanup_thread.is_alive():
                    self.last_cleanup_thread.join()

                worker_count = len(self.workers)
                worker_to_use = self.workers[self.current_worker_index]
                worker_to_cleanup = self.workers[(self.current_worker_index - 1) % worker_count]

                # With a single worker the "previous" worker IS the one about
                # to run. Unloading it in the background would race with
                # initialize_models() below and could leave it without models
                # mid-generation, so cleanup is skipped in that case.
                if worker_to_cleanup is not worker_to_use:
                    cleanup_thread = threading.Thread(target=self._cleanup_worker_thread, args=(worker_to_cleanup,))
                    cleanup_thread.start()
                    self.last_cleanup_thread = cleanup_thread

                worker_to_use.initialize_models()
                self.current_worker_index = (self.current_worker_index + 1) % worker_count

            # NOTE(review): generation runs outside the lock, so a concurrent
            # call could start unloading this worker while it is still
            # generating — confirm callers serialize requests.
            logger.info(f"MMAUDIO POOL MANAGER: Gerando áudio em {worker_to_use.device}...")

            output_path = output_path_override or os.path.join(self.workspace_dir, f"{Path(video_path).stem}_with_audio.mp4")

            return worker_to_use.generate_audio_internal(
                video_path=video_path, prompt=prompt, duration_seconds=duration_seconds, output_path=output_path
            )
        except Exception as e:
            logger.error(f"MMAUDIO POOL MANAGER: Erro durante a geração de áudio: {e}", exc_info=True)
            raise gr.Error(f"Falha na geração de áudio: {e}") from e
203
+
204
+ # --- Instanciação Singleton ---
205
class MMAudioPlaceholder:
    """Stand-in used when no MMAudio pool could be created.

    Exposes the same ``generate_audio_for_video`` entry point as the real
    manager but performs no generation.
    """

    def generate_audio_for_video(self, video_path, *args, **kwargs):
        """Log that audio is unavailable and return ``video_path`` unchanged."""
        message = "MMAudio não foi inicializado pois nenhuma GPU foi alocada. Pulando etapa de áudio."
        logger.error(message)
        return video_path
209
 
 
210
# Build the module-level audio specialist from config.yaml. A real pool is
# created only when GPUs were requested and the allocation contains no 'cpu'
# entry; every other outcome (including any exception) yields the placeholder.
try:
    with open("config.yaml", 'r') as f:
        config = yaml.safe_load(f)
    WORKSPACE_DIR = config['application']['workspace_dir']

    mmaudio_gpus_required = config['specialists'].get('mmaudio', {}).get('gpus_required', 0)
    mmaudio_device_ids = hardware_manager.allocate_gpus('MMAudio', mmaudio_gpus_required)

    if not (mmaudio_gpus_required > 0 and 'cpu' not in mmaudio_device_ids):
        mmaudio_manager_singleton = MMAudioPlaceholder()
        logger.warning("MMAudio Pool Manager não foi inicializado. Nenhuma GPU foi requisitada na config.yaml.")
    else:
        mmaudio_manager_singleton = MMAudioPoolManager(device_ids=mmaudio_device_ids, workspace_dir=WORKSPACE_DIR)
        logger.info("Especialista de Áudio (MMAudio Pool) pronto.")
except Exception as e:
    logger.critical(f"Falha CRÍTICA ao inicializar o MMAudioManager: {e}", exc_info=True)
    mmaudio_manager_singleton = MMAudioPlaceholder()