EuuIia committed (verified)
Commit cb3f487 · 1 Parent(s): 31d7902

Update video_service.py

Files changed (1)
  1. video_service.py +104 -101
video_service.py CHANGED
@@ -17,6 +17,110 @@ import subprocess
 
 # --- 2. DEPENDENCY MANAGEMENT AND SETUP ---
 
+def _query_gpu_processes_via_nvml(device_index: int) -> List[Dict]:
+    try:
+        import psutil
+        import pynvml as nvml
+        nvml.nvmlInit()
+        handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
+        # Try the v3 call first, then fall back to the generic name if the binding differs
+        try:
+            procs = nvml.nvmlDeviceGetComputeRunningProcesses_v3(handle)
+        except Exception:
+            procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
+        results = []
+        for p in procs:
+            pid = int(p.pid)
+            used_mb = None
+            try:
+                # NVML returns bytes; some bindings may use NVML_VALUE_NOT_AVAILABLE
+                if getattr(p, "usedGpuMemory", None) is not None and p.usedGpuMemory not in (0,):
+                    used_mb = max(0, int(p.usedGpuMemory) // (1024 * 1024))
+            except Exception:
+                used_mb = None
+            name = "unknown"
+            user = "unknown"
+            try:
+                pr = psutil.Process(pid)
+                name = pr.name()
+                user = pr.username()
+            except Exception:
+                pass
+            results.append({"pid": pid, "name": name, "user": user, "used_mb": used_mb})
+        nvml.nvmlShutdown()
+        return results
+    except Exception:
+        return []
+
+def _query_gpu_processes_via_nvidiasmi(device_index: int) -> List[Dict]:
+    # CSV, no header, no units gives lines: "PID,process_name,used_memory"
+    cmd = f"nvidia-smi -i {device_index} --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits"
+    try:
+        out = subprocess.check_output(shlex.split(cmd), stderr=subprocess.STDOUT, text=True, timeout=2.0)
+    except Exception:
+        return []
+    results = []
+    for line in out.strip().splitlines():
+        parts = [p.strip() for p in line.split(",")]
+        if len(parts) >= 3:
+            try:
+                pid = int(parts[0])
+                name = parts[1]
+                used_mb = int(parts[2])
+                user = "unknown"
+                try:
+                    import psutil
+                    pr = psutil.Process(pid)
+                    user = pr.username()
+                except Exception:
+                    pass
+                results.append({"pid": pid, "name": name, "user": user, "used_mb": used_mb})
+            except Exception:
+                continue
+    return results
+
+def _gpu_process_table(processes: List[Dict], current_pid: int) -> str:
+    if not processes:
+        return " - Processos ativos: (nenhum)\n"
+    # Sort by used_mb, descending
+    processes = sorted(processes, key=lambda x: (x.get("used_mb") or 0), reverse=True)
+    lines = [" - Processos ativos (PID | USER | NAME | VRAM MB):"]
+    for p in processes:
+        star = "*" if p["pid"] == current_pid else " "
+        used_str = str(p["used_mb"]) if p.get("used_mb") is not None else "N/A"
+        lines.append(f" {star} {p['pid']} | {p['user']} | {p['name']} | {used_str}")
+    return "\n".join(lines) + "\n"
+
+# Integration into the existing method:
+def _log_gpu_memory(self, stage_name: str):
+    import torch
+    if self.device != "cuda":
+        return
+    device_index = torch.cuda.current_device() if torch.cuda.is_available() else 0
+    current_reserved_b = torch.cuda.memory_reserved(device_index)
+    current_reserved_mb = current_reserved_b / (1024 ** 2)
+    total_memory_b = torch.cuda.get_device_properties(device_index).total_memory
+    total_memory_mb = total_memory_b / (1024 ** 2)
+    peak_reserved_mb = torch.cuda.max_memory_reserved(device_index) / (1024 ** 2)
+    delta_mb = current_reserved_mb - getattr(self, "last_memory_reserved_mb", 0.0)
+
+    # Process collection: try NVML first, then fall back to nvidia-smi
+    processes = _query_gpu_processes_via_nvml(device_index)
+    if not processes:
+        processes = _query_gpu_processes_via_nvidiasmi(device_index)
+
+    print(f"\n--- [LOG DE MEMÓRIA GPU] - {stage_name} (cuda:{device_index}) ---")
+    print(f" - Uso Atual (Reservado): {current_reserved_mb:.2f} MB / {total_memory_mb:.2f} MB")
+    print(f" - Variação desde o último log: {delta_mb:+.2f} MB")
+    if peak_reserved_mb > getattr(self, "last_memory_reserved_mb", 0.0):
+        print(f" - Pico de Uso (nesta operação): {peak_reserved_mb:.2f} MB")
+    # Print the per-process table
+    print(_gpu_process_table(processes, os.getpid()), end="")
+    print("--------------------------------------------------\n")
+    self.last_memory_reserved_mb = current_reserved_mb
+
+
+
 def run_setup():
     """Executes the setup.py script to clone the required dependencies."""
     setup_script_path = "setup.py"
@@ -150,107 +254,6 @@ class VideoService:
         except Exception:
             pass
 
-    def _query_gpu_processes_via_nvml(device_index: int) -> List[Dict]:
-        try:
-            import psutil
-            import pynvml as nvml
-            nvml.nvmlInit()
-            handle = nvml.nvmlDeviceGetHandleByIndex(device_index)
-            # Try the v3 call first, then fall back to the generic name if the binding differs
-            try:
-                procs = nvml.nvmlDeviceGetComputeRunningProcesses_v3(handle)
-            except Exception:
-                procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
-            results = []
-            for p in procs:
-                pid = int(p.pid)
-                used_mb = None
-                try:
-                    # NVML returns bytes; some bindings may use NVML_VALUE_NOT_AVAILABLE
-                    if getattr(p, "usedGpuMemory", None) is not None and p.usedGpuMemory not in (0,):
-                        used_mb = max(0, int(p.usedGpuMemory) // (1024 * 1024))
-                except Exception:
-                    used_mb = None
-                name = "unknown"
-                user = "unknown"
-                try:
-                    pr = psutil.Process(pid)
-                    name = pr.name()
-                    user = pr.username()
-                except Exception:
-                    pass
-                results.append({"pid": pid, "name": name, "user": user, "used_mb": used_mb})
-            nvml.nvmlShutdown()
-            return results
-        except Exception:
-            return []
-
-    def _query_gpu_processes_via_nvidiasmi(device_index: int) -> List[Dict]:
-        # CSV, no header, no units gives lines: "PID,process_name,used_memory"
-        cmd = f"nvidia-smi -i {device_index} --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits"
-        try:
-            out = subprocess.check_output(shlex.split(cmd), stderr=subprocess.STDOUT, text=True, timeout=2.0)
-        except Exception:
-            return []
-        results = []
-        for line in out.strip().splitlines():
-            parts = [p.strip() for p in line.split(",")]
-            if len(parts) >= 3:
-                try:
-                    pid = int(parts[0])
-                    name = parts[1]
-                    used_mb = int(parts[2])
-                    user = "unknown"
-                    try:
-                        import psutil
-                        pr = psutil.Process(pid)
-                        user = pr.username()
-                    except Exception:
-                        pass
-                    results.append({"pid": pid, "name": name, "user": user, "used_mb": used_mb})
-                except Exception:
-                    continue
-        return results
-
-    def _gpu_process_table(processes: List[Dict], current_pid: int) -> str:
-        if not processes:
-            return " - Processos ativos: (nenhum)\n"
-        # Sort by used_mb, descending
-        processes = sorted(processes, key=lambda x: (x.get("used_mb") or 0), reverse=True)
-        lines = [" - Processos ativos (PID | USER | NAME | VRAM MB):"]
-        for p in processes:
-            star = "*" if p["pid"] == current_pid else " "
-            used_str = str(p["used_mb"]) if p.get("used_mb") is not None else "N/A"
-            lines.append(f" {star} {p['pid']} | {p['user']} | {p['name']} | {used_str}")
-        return "\n".join(lines) + "\n"
-
-    # Integration into the existing method:
-    def _log_gpu_memory(self, stage_name: str):
-        import torch
-        if self.device != "cuda":
-            return
-        device_index = torch.cuda.current_device() if torch.cuda.is_available() else 0
-        current_reserved_b = torch.cuda.memory_reserved(device_index)
-        current_reserved_mb = current_reserved_b / (1024 ** 2)
-        total_memory_b = torch.cuda.get_device_properties(device_index).total_memory
-        total_memory_mb = total_memory_b / (1024 ** 2)
-        peak_reserved_mb = torch.cuda.max_memory_reserved(device_index) / (1024 ** 2)
-        delta_mb = current_reserved_mb - getattr(self, "last_memory_reserved_mb", 0.0)
-
-        # Process collection: try NVML first, then fall back to nvidia-smi
-        processes = _query_gpu_processes_via_nvml(device_index)
-        if not processes:
-            processes = _query_gpu_processes_via_nvidiasmi(device_index)
-
-        print(f"\n--- [LOG DE MEMÓRIA GPU] - {stage_name} (cuda:{device_index}) ---")
-        print(f" - Uso Atual (Reservado): {current_reserved_mb:.2f} MB / {total_memory_mb:.2f} MB")
-        print(f" - Variação desde o último log: {delta_mb:+.2f} MB")
-        if peak_reserved_mb > getattr(self, "last_memory_reserved_mb", 0.0):
-            print(f" - Pico de Uso (nesta operação): {peak_reserved_mb:.2f} MB")
-        # Print the per-process table
-        print(_gpu_process_table(processes, os.getpid()), end="")
-        print("--------------------------------------------------\n")
-        self.last_memory_reserved_mb = current_reserved_mb
 
     def _load_config(self):
         config_file_path = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled.yaml"
 
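For reference, a minimal usage sketch of the module-level helpers added above, assuming video_service.py imports without side effects and that GPU 0 is the target device; the print_gpu_processes wrapper is hypothetical and only mirrors the NVML-then-nvidia-smi fallback order used by _log_gpu_memory:

import os

from video_service import (
    _query_gpu_processes_via_nvml,
    _query_gpu_processes_via_nvidiasmi,
    _gpu_process_table,
)

def print_gpu_processes(device_index: int = 0) -> None:
    # Same fallback chain as _log_gpu_memory: NVML first, then nvidia-smi;
    # both helpers return an empty list when the query is unavailable.
    processes = _query_gpu_processes_via_nvml(device_index)
    if not processes:
        processes = _query_gpu_processes_via_nvidiasmi(device_index)
    # The current PID is marked with "*" in the rendered table.
    print(_gpu_process_table(processes, os.getpid()), end="")

if __name__ == "__main__":
    print_gpu_processes(0)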