MogensR committed
Commit ee9591a · 1 Parent(s): 8ac347b

Update processing/video/video_processor.py

Files changed (1)
  1. processing/video/video_processor.py +88 -76
processing/video/video_processor.py CHANGED
@@ -1,21 +1,15 @@
 #!/usr/bin/env python3
 """
-Compatibility shim: CoreVideoProcessor
+Compatibility shim: CoreVideoProcessor (stabilized + crisper edges)
 
-Stability features:
-- Temporal EMA smoothing of masks (alpha → previous; higher = calmer)
-- IoU outlier rejection (skip sudden mask jumps vs previous mask)
-- Edge padding (dilate) + small edge blur to calm shimmering edges
-
-Other features:
 - Accepts background configs:
     {"custom_path": "/path/to/image.png"}
     {"background_choice": "<preset_key>"}
     {"gradient": {type, start, end, angle_deg}}
 - Model-only downscale (max_model_size) for speed, full-res render.
-- FFmpeg pipe writer with encoder fallbacks (NVENC/libx264/mpeg4) and
-  stderr surfacing; falls back to OpenCV VideoWriter if FFmpeg isn't
-  available or fails mid-run.
+- FFmpeg pipe writer with encoder fallbacks and stderr surfacing; falls back
+  to OpenCV VideoWriter if FFmpeg isn't available or fails mid-run.
+- Temporal smoothing + mask hardening to avoid flicker/ghosting.
 
 Requirements for the models provider:
   - get_sam2() -> predictor or None
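For reference, the three background-config shapes listed in the docstring look like this as plain dicts. This is only a sketch: the preset key and gradient values below are made up, and the exact value types are whatever `_prepare_background_from_config` accepts.

```python
# Illustrative background configs; only the key names come from the docstring.
bg_from_file = {"custom_path": "/path/to/image.png"}
bg_from_preset = {"background_choice": "studio_gray"}   # hypothetical preset key
bg_from_gradient = {
    "gradient": {
        "type": "linear",     # assumed gradient type
        "start": "#202020",   # assumed color format
        "end": "#5a5a78",
        "angle_deg": 45,
    }
}
```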
@@ -99,16 +93,7 @@ class ProcessorConfig:
     # Model-only downscale (speedup without changing output resolution)
     max_model_size: Optional[int] = 1280
 
-    # ------------------ Stability knobs ------------------
-    # EMA: smoothed = alpha*prev + (1-alpha)*current (higher alpha = calmer)
-    temporal_ema_alpha: float = 0.75  # 0.6–0.85 typical
-    # Reject frames whose mask jumps too much vs previous (IoU threshold)
-    min_iou_to_accept: float = 0.05   # 0.0 disables rejection
-    # Edge padding + blur (in pixels) to reduce edge shimmer
-    dilate_px: int = 6                # 0 disables
-    edge_blur_px: int = 2             # 0 disables
-
-    # ------------------ Encoding -------------------------
+    # FFmpeg / NVENC output (pipe). If disabled or unavailable, use OpenCV writer.
     use_nvenc: bool = True
     nvenc_codec: str = "h264"   # "h264" or "hevc"
     nvenc_preset: str = "p5"    # NVENC preset string
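These encoding fields drive a raw-BGR FFmpeg pipe. As a rough sketch (the actual command `_FFmpegPipe` assembles is not shown in this hunk and may differ, especially in per-encoder preset handling), the write side looks like:

```python
# Sketch only: rawvideo-over-stdin command for an NVENC-first pipeline.
def ffmpeg_cmd(width, height, fps, out_path, encoder="h264_nvenc", preset="p5"):
    return [
        "ffmpeg", "-y",
        "-f", "rawvideo", "-pix_fmt", "bgr24",
        "-s", f"{width}x{height}", "-r", f"{fps}",
        "-i", "-",              # frames are written to stdin
        "-an",                  # no audio
        "-c:v", encoder,
        "-preset", preset,      # NVENC presets like "p5"; CPU encoders use other names
        "-movflags", "+faststart",
        out_path,
    ]

# Fallback order implied by the old module docstring: NVENC, then libx264, then mpeg4.
ENCODER_CANDIDATES = ["h264_nvenc", "libx264", "mpeg4"]
```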
@@ -123,6 +108,17 @@ class ProcessorConfig:
 
     movflags_faststart: bool = True
 
+    # ---------- stability & edge quality ----------
+    temporal_ema_alpha: float = 0.75  # higher = calmer (0.6–0.85 typical)
+    min_iou_to_accept: float = 0.05   # reject sudden mask jumps
+    dilate_px: int = 6                # pad edges to keep hair/ears/shoulders
+    edge_blur_px: int = 1             # tiny blur to calm edge shimmer
+
+    # hardening (turn soft mask into crisper 0/1)
+    hard_low: float = 0.35            # values below -> 0
+    hard_high: float = 0.70           # values above -> 1
+    mask_gamma: float = 0.90          # <1 boosts mid-tones slightly
+
 
 class _FFmpegPipe:
     """
@@ -139,7 +135,7 @@ def __init__(self, width: int, height: int, fps: float, out_path: str, cfg: Proc
 
         self.proc: Optional[subprocess.Popen] = None
         self.encoder_used: Optional[str] = None
-        self._stderr: Optional[bytes] = None
+        self._stderr: bytes | None = None
 
         self._ffmpeg = shutil.which("ffmpeg")
         if not self._ffmpeg:
@@ -248,6 +244,7 @@ def write(self, frame_bgr: np.ndarray):
         if frame_bgr.shape[0] != self.height or frame_bgr.shape[1] != self.width:
             raise ValueError(f"Frame size mismatch. Expected {self.width}x{self.height}, got {frame_bgr.shape[1]}x{frame_bgr.shape[0]}.")
 
+        # ensure contiguous for tobytes()
         frame_bgr = np.ascontiguousarray(frame_bgr)
         try:
             self.proc.stdin.write(frame_bgr.tobytes())
@@ -313,39 +310,66 @@ def __init__(self, config: Optional[ProcessorConfig] = None, models: Optional[An
         if self.models is None:
             self.log.warning("CoreVideoProcessor initialized without a models provider; will use fallbacks.")
         self._ffmpeg = shutil.which("ffmpeg")
-        # temporal state
+
+        # state for temporal smoothing
         self._prev_mask: Optional[np.ndarray] = None
 
-    # ---------- utils: stability ----------
-    @staticmethod
-    def _mask_iou(a: np.ndarray, b: np.ndarray, thr: float = 0.5) -> float:
+    # ---------- mask post-processing (stability + crispness) ----------
+    def _iou(self, a: np.ndarray, b: np.ndarray, thr: float = 0.5) -> float:
         a_bin = (a >= thr).astype(np.uint8)
         b_bin = (b >= thr).astype(np.uint8)
-        inter = (a_bin & b_bin).sum(dtype=np.int64)
-        union = (a_bin | b_bin).sum(dtype=np.int64)
-        return float(inter) / float(union) if union > 0 else 1.0
-
-    @staticmethod
-    def _dilate_and_blur(mask01: np.ndarray, dilate_px: int, blur_px: int) -> np.ndarray:
-        m = mask01
-        if dilate_px and dilate_px > 0:
-            k = max(1, int(dilate_px))
-            kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2*k+1, 2*k+1))
-            m = cv2.dilate((m*255).astype(np.uint8), kernel)
-            m = m.astype(np.float32)/255.0
-        if blur_px and blur_px > 0:
-            k = max(1, int(blur_px)*2+1)
-            m = cv2.GaussianBlur((m*255).astype(np.uint8), (k, k), 0).astype(np.float32)/255.0
+        inter = np.count_nonzero(cv2.bitwise_and(a_bin, b_bin))
+        union = np.count_nonzero(cv2.bitwise_or(a_bin, b_bin))
+        return (inter / union) if union else 0.0
+
+    def _harden(self, m: np.ndarray) -> np.ndarray:
+        # optional gamma
+        g = float(self.config.mask_gamma)
+        if abs(g - 1.0) > 1e-6:
+            m = np.clip(m, 0, 1) ** g
+
+        lo = float(self.config.hard_low)
+        hi = float(self.config.hard_high)
+        if hi > lo + 1e-6:
+            m = (m - lo) / (hi - lo)
+            m = np.clip(m, 0.0, 1.0)
+
+        # pad edges then tiny blur
+        k = int(self.config.dilate_px)
+        if k > 0:
+            se = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2*k+1, 2*k+1))
+            m = cv2.dilate(m, se, iterations=1)
+
+        eb = int(self.config.edge_blur_px)
+        if eb > 0:
+            m = cv2.GaussianBlur(m, (2*eb+1, 2*eb+1), 0)
+
         return np.clip(m, 0.0, 1.0)
 
+    def _stabilize(self, m: np.ndarray) -> np.ndarray:
+        if self._prev_mask is None:
+            self._prev_mask = m
+            return m
+
+        # outlier rejection
+        if self._iou(self._prev_mask, m, 0.5) < float(self.config.min_iou_to_accept):
+            # ignore this frame's mask → keep previous
+            return self._prev_mask
+
+        # EMA
+        a = float(self.config.temporal_ema_alpha)
+        m_ema = a * self._prev_mask + (1.0 - a) * m
+        self._prev_mask = m_ema
+        return m_ema
+
     # ---------- Single frame ----------
     def process_frame(self, frame_bgr: np.ndarray, background_rgb: np.ndarray) -> Dict[str, Any]:
         """
         Process one frame:
           - optionally downscale for model work,
           - segment + refine,
+          - temporal stabilize + harden,
           - upsample mask,
-          - temporal smoothing + IoU rejection + edge padding/blur,
          - composite full-res.
         Returns dict with composited frame (BGR for writer) and mask (H,W float).
         """
@@ -362,7 +386,9 @@ def process_frame(self, frame_bgr: np.ndarray, background_rgb: np.ndarray) -> Di
             proc_frame_bgr = cv2.resize(frame_bgr, (newW, newH), interpolation=cv2.INTER_AREA)
             self.log.debug(f"Model-only downscale: {W}x{H} -> {newW}x{newH} (scale={scale:.3f})")
 
-        # SAM2 predictor (if any)
+        # RGB for models
+        proc_frame_rgb = cv2.cvtColor(proc_frame_bgr, cv2.COLOR_BGR2RGB)
+
         predictor = None
         try:
             if self.models and hasattr(self.models, "get_sam2"):
@@ -370,11 +396,10 @@ def process_frame(self, frame_bgr: np.ndarray, background_rgb: np.ndarray) -> Di
         except Exception as e:
             self.log.warning(f"SAM2 predictor unavailable: {e}")
 
-        # 1) segmentation (with fallbacks inside)
-        proc_frame_rgb_for_seg = cv2.cvtColor(proc_frame_bgr, cv2.COLOR_BGR2RGB)
-        mask_small = segment_person_hq(proc_frame_rgb_for_seg, predictor, use_sam2=True)
+        # 1) segmentation (with internal fallbacks)
+        mask_small = segment_person_hq(proc_frame_rgb, predictor, use_sam2=True)
 
-        # 2) refinement (MatAnyOne if available, else robust OpenCV path)
+        # 2) refinement (MatAnyOne if available)
         matanyone = None
         try:
             if self.models and hasattr(self.models, "get_matanyone"):
@@ -382,39 +407,25 @@ def process_frame(self, frame_bgr: np.ndarray, background_rgb: np.ndarray) -> Di
         except Exception as e:
             self.log.warning(f"MatAnyOne unavailable: {e}")
 
-        # IMPORTANT: refine_mask_hq expects (frame, mask, ...). Give it *BGR* frame.
-        mask_small_ref = refine_mask_hq(proc_frame_bgr, mask_small, matanyone=matanyone, use_matanyone=True)
+        # IMPORTANT: call order is (frame, mask, matanyone=...)
+        mask_small_ref = refine_mask_hq(proc_frame_rgb, mask_small, matanyone=matanyone, use_matanyone=True)
+
+        # Stabilize + harden at model scale
+        mask_small_ref = np.clip(mask_small_ref.astype(np.float32), 0.0, 1.0)
+        mask_stable = self._stabilize(mask_small_ref)
+        mask_stable = self._harden(mask_stable)
 
         # Upsample mask back to full-res
         if scale != 1.0:
-            mask_full = cv2.resize(mask_small_ref.astype(np.float32), (W, H), interpolation=cv2.INTER_LINEAR)
+            mask_full = cv2.resize(mask_stable, (W, H), interpolation=cv2.INTER_LINEAR)
         else:
-            mask_full = mask_small_ref.astype(np.float32)
-
-        # ----- Stability pipeline -----
-        # IoU rejection
-        if self._prev_mask is not None and self.config.min_iou_to_accept > 0.0:
-            iou = self._mask_iou(mask_full, self._prev_mask, thr=0.5)
-            if iou < float(self.config.min_iou_to_accept):
-                # jump detected → keep previous mask (skip update)
-                mask_full = self._prev_mask
-
-        # EMA smoothing (alpha→previous)
-        if self._prev_mask is not None and 0.0 < float(self.config.temporal_ema_alpha) < 1.0:
-            a = float(self.config.temporal_ema_alpha)
-            mask_full = a * self._prev_mask + (1.0 - a) * mask_full
+            mask_full = mask_stable
 
-        # Edge padding + blur
-        mask_full = self._dilate_and_blur(mask_full, self.config.dilate_px, self.config.edge_blur_px)
-
-        # Update state
-        self._prev_mask = mask_full
-
-        # 3) compositing — pass RGB frame to match RGB background
+        # 3) compositing (helpers expect RGB inputs; return RGB)
         frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
         out_rgb = replace_background_hq(frame_rgb, mask_full, background_rgb)
 
-        # Convert back to BGR for writer
+        # Convert to BGR for writer
         out_bgr = cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR)
         return {"frame": out_bgr, "mask": mask_full}
 
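`replace_background_hq` itself is not part of this diff; conceptually the final composite is the usual per-pixel alpha blend of the RGB frame over the RGB background using the float mask, something like the sketch below. The dilate + blur applied earlier widens and feathers the mask, which is what keeps hair and shoulder edges from shimmering in this blend.

```python
import numpy as np

def composite_rgb(frame_rgb, mask, background_rgb):
    """Plain alpha blend with a float mask in [0, 1] (sketch, not the real helper)."""
    a = np.clip(mask, 0.0, 1.0).astype(np.float32)[..., None]   # (H, W, 1) broadcasts over channels
    out = a * frame_rgb.astype(np.float32) + (1.0 - a) * background_rgb.astype(np.float32)
    return np.clip(out, 0, 255).astype(np.uint8)
```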
@@ -493,9 +504,12 @@ def process_video(
         # Background once (RGB)
         background_rgb = self._prepare_background_from_config(bg_config, width, height)
 
+        # reset temporal state for a new video
+        self._prev_mask = None
+
         # Writer selection
-        ffmpeg_pipe: Optional[_FFmpegPipe] = None
-        writer: Optional[cv2.VideoWriter] = None
+        ffmpeg_pipe: _FFmpegPipe | None = None
+        writer: cv2.VideoWriter | None = None
         ffmpeg_failed_reason = None
 
         if self.config.use_nvenc and self._ffmpeg:
@@ -514,8 +528,6 @@
 
         frame_count = 0
         start_time = time.time()
-        self._prev_mask = None  # reset temporal state per video
-
         try:
             while True:
                 ret, frame_bgr = cap.read()
@@ -536,7 +548,7 @@
                 try:
                     ffmpeg_pipe.write(out_bgr)
                 except Exception as e:
-                    # Switch to OpenCV writer mid-run and continue (note: output will only contain frames from this point on)
+                    # Switch to OpenCV writer mid-run and continue
                     self.log.warning("Switching to OpenCV writer after FFmpeg error at frame %d: %s", frame_count, e)
                     try:
                         ffmpeg_pipe.close()
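The mid-run fallback replaces the dead FFmpeg pipe with an OpenCV writer. A sketch of that pattern (variable names are assumptions; as the removed comment noted, the OpenCV file then only contains frames from the switch onward):

```python
import cv2

def open_fallback_writer(out_path, fps, width, height):
    """OpenCV VideoWriter used when the FFmpeg pipe fails (sketch)."""
    writer = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
    if not writer.isOpened():
        raise RuntimeError(f"OpenCV VideoWriter could not open {out_path}")
    return writer

# Inside the frame loop, roughly:
#   try:
#       ffmpeg_pipe.write(out_bgr)
#   except Exception:
#       ffmpeg_pipe.close()
#       ffmpeg_pipe = None
#       writer = open_fallback_writer(out_path, fps, width, height)
#       writer.write(out_bgr)   # this file only has frames from here on
```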
 