Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 27

Commit

ee9591a

1 Parent(s): 8ac347b

Update processing/video/video_processor.py

Browse files

Files changed (1) hide show

processing/video/video_processor.py +88 -76

processing/video/video_processor.py CHANGED Viewed

@@ -1,21 +1,15 @@
 #!/usr/bin/env python3
 """
-Compatibility shim: CoreVideoProcessor
-Stability features:
-- Temporal EMA smoothing of masks (alpha → previous; higher = calmer)
-- IoU outlier rejection (skip sudden mask jumps vs previous mask)
-- Edge padding (dilate) + small edge blur to calm shimmering edges
-Other features:
 - Accepts background configs:
     {"custom_path": "/path/to/image.png"}
     {"background_choice": "<preset_key>"}
     {"gradient": {type, start, end, angle_deg}}
 - Model-only downscale (max_model_size) for speed, full-res render.
-- FFmpeg pipe writer with encoder fallbacks (NVENC/libx264/mpeg4) and
-  stderr surfacing; falls back to OpenCV VideoWriter if FFmpeg isn't
-  available or fails mid-run.
 Requirements for the models provider:
 - get_sam2() -> predictor or None
@@ -99,16 +93,7 @@ class ProcessorConfig:
     # Model-only downscale (speedup without changing output resolution)
     max_model_size: Optional[int] = 1280
-    # ------------------ Stability knobs ------------------
-    # EMA: smoothed = alpha*prev + (1-alpha)*current  (higher alpha = calmer)
-    temporal_ema_alpha: float = 0.75     # 0.6–0.85 typical
-    # Reject frames whose mask jumps too much vs previous (IoU threshold)
-    min_iou_to_accept: float = 0.05      # 0.0 disables rejection
-    # Edge padding + blur (in pixels) to reduce edge shimmer
-    dilate_px: int = 6                   # 0 disables
-    edge_blur_px: int = 2                # 0 disables
-    # ------------------ Encoding -------------------------
     use_nvenc: bool = True
     nvenc_codec: str = "h264"            # "h264" or "hevc"
     nvenc_preset: str = "p5"             # NVENC preset string
@@ -123,6 +108,17 @@ class ProcessorConfig:
     movflags_faststart: bool = True
 class _FFmpegPipe:
     """
@@ -139,7 +135,7 @@ def __init__(self, width: int, height: int, fps: float, out_path: str, cfg: Proc
         self.proc: Optional[subprocess.Popen] = None
         self.encoder_used: Optional[str] = None
-        self._stderr: Optional[bytes] = None
         self._ffmpeg = shutil.which("ffmpeg")
         if not self._ffmpeg:
@@ -248,6 +244,7 @@ def write(self, frame_bgr: np.ndarray):
         if frame_bgr.shape[0] != self.height or frame_bgr.shape[1] != self.width:
             raise ValueError(f"Frame size mismatch. Expected {self.width}x{self.height}, got {frame_bgr.shape[1]}x{frame_bgr.shape[0]}.")
         frame_bgr = np.ascontiguousarray(frame_bgr)
         try:
             self.proc.stdin.write(frame_bgr.tobytes())
@@ -313,39 +310,66 @@ def __init__(self, config: Optional[ProcessorConfig] = None, models: Optional[An
         if self.models is None:
             self.log.warning("CoreVideoProcessor initialized without a models provider; will use fallbacks.")
         self._ffmpeg = shutil.which("ffmpeg")
-        # temporal state
         self._prev_mask: Optional[np.ndarray] = None
-    # ---------- utils: stability ----------
-    @staticmethod
-    def _mask_iou(a: np.ndarray, b: np.ndarray, thr: float = 0.5) -> float:
         a_bin = (a >= thr).astype(np.uint8)
         b_bin = (b >= thr).astype(np.uint8)
-        inter = (a_bin & b_bin).sum(dtype=np.int64)
-        union = (a_bin | b_bin).sum(dtype=np.int64)
-        return float(inter) / float(union) if union > 0 else 1.0
-    @staticmethod
-    def _dilate_and_blur(mask01: np.ndarray, dilate_px: int, blur_px: int) -> np.ndarray:
-        m = mask01
-        if dilate_px and dilate_px > 0:
-            k = max(1, int(dilate_px))
-            kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2*k+1, 2*k+1))
-            m = cv2.dilate((m*255).astype(np.uint8), kernel)
-            m = m.astype(np.float32)/255.0
-        if blur_px and blur_px > 0:
-            k = max(1, int(blur_px)*2+1)
-            m = cv2.GaussianBlur((m*255).astype(np.uint8), (k, k), 0).astype(np.float32)/255.0
         return np.clip(m, 0.0, 1.0)
     # ---------- Single frame ----------
     def process_frame(self, frame_bgr: np.ndarray, background_rgb: np.ndarray) -> Dict[str, Any]:
         """
         Process one frame:
           - optionally downscale for model work,
           - segment + refine,
           - upsample mask,
-          - temporal smoothing + IoU rejection + edge padding/blur,
           - composite full-res.
         Returns dict with composited frame (BGR for writer) and mask (H,W float).
         """
@@ -362,7 +386,9 @@ def process_frame(self, frame_bgr: np.ndarray, background_rgb: np.ndarray) -> Di
             proc_frame_bgr = cv2.resize(frame_bgr, (newW, newH), interpolation=cv2.INTER_AREA)
             self.log.debug(f"Model-only downscale: {W}x{H} -> {newW}x{newH} (scale={scale:.3f})")
-        # SAM2 predictor (if any)
         predictor = None
         try:
             if self.models and hasattr(self.models, "get_sam2"):
@@ -370,11 +396,10 @@ def process_frame(self, frame_bgr: np.ndarray, background_rgb: np.ndarray) -> Di
         except Exception as e:
             self.log.warning(f"SAM2 predictor unavailable: {e}")
-        # 1) segmentation (with fallbacks inside)
-        proc_frame_rgb_for_seg = cv2.cvtColor(proc_frame_bgr, cv2.COLOR_BGR2RGB)
-        mask_small = segment_person_hq(proc_frame_rgb_for_seg, predictor, use_sam2=True)
-        # 2) refinement (MatAnyOne if available, else robust OpenCV path)
         matanyone = None
         try:
             if self.models and hasattr(self.models, "get_matanyone"):
@@ -382,39 +407,25 @@ def process_frame(self, frame_bgr: np.ndarray, background_rgb: np.ndarray) -> Di
         except Exception as e:
             self.log.warning(f"MatAnyOne unavailable: {e}")
-        # IMPORTANT: refine_mask_hq expects (frame, mask, ...). Give it *BGR* frame.
-        mask_small_ref = refine_mask_hq(proc_frame_bgr, mask_small, matanyone=matanyone, use_matanyone=True)
         # Upsample mask back to full-res
         if scale != 1.0:
-            mask_full = cv2.resize(mask_small_ref.astype(np.float32), (W, H), interpolation=cv2.INTER_LINEAR)
         else:
-            mask_full = mask_small_ref.astype(np.float32)
-        # ----- Stability pipeline -----
-        # IoU rejection
-        if self._prev_mask is not None and self.config.min_iou_to_accept > 0.0:
-            iou = self._mask_iou(mask_full, self._prev_mask, thr=0.5)
-            if iou < float(self.config.min_iou_to_accept):
-                # jump detected → keep previous mask (skip update)
-                mask_full = self._prev_mask
-        # EMA smoothing (alpha→previous)
-        if self._prev_mask is not None and 0.0 < float(self.config.temporal_ema_alpha) < 1.0:
-            a = float(self.config.temporal_ema_alpha)
-            mask_full = a * self._prev_mask + (1.0 - a) * mask_full
-        # Edge padding + blur
-        mask_full = self._dilate_and_blur(mask_full, self.config.dilate_px, self.config.edge_blur_px)
-        # Update state
-        self._prev_mask = mask_full
-        # 3) compositing — pass RGB frame to match RGB background
         frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
         out_rgb = replace_background_hq(frame_rgb, mask_full, background_rgb)
-        # Convert back to BGR for writer
         out_bgr = cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR)
         return {"frame": out_bgr, "mask": mask_full}
@@ -493,9 +504,12 @@ def process_video(
         # Background once (RGB)
         background_rgb = self._prepare_background_from_config(bg_config, width, height)
         # Writer selection
-        ffmpeg_pipe: Optional[_FFmpegPipe] = None
-        writer: Optional[cv2.VideoWriter] = None
         ffmpeg_failed_reason = None
         if self.config.use_nvenc and self._ffmpeg:
@@ -514,8 +528,6 @@ def process_video(
         frame_count = 0
         start_time = time.time()
-        self._prev_mask = None  # reset temporal state per video
         try:
             while True:
                 ret, frame_bgr = cap.read()
@@ -536,7 +548,7 @@ def process_video(
                     try:
                         ffmpeg_pipe.write(out_bgr)
                     except Exception as e:
-                        # Switch to OpenCV writer mid-run and continue (note: output will only contain frames from this point on)
                         self.log.warning("Switching to OpenCV writer after FFmpeg error at frame %d: %s", frame_count, e)
                         try:
                             ffmpeg_pipe.close()

 #!/usr/bin/env python3
 """
+Compatibility shim: CoreVideoProcessor (stabilized + crisper edges)
 - Accepts background configs:
     {"custom_path": "/path/to/image.png"}
     {"background_choice": "<preset_key>"}
     {"gradient": {type, start, end, angle_deg}}
 - Model-only downscale (max_model_size) for speed, full-res render.
+- FFmpeg pipe writer with encoder fallbacks and stderr surfacing; falls back
+  to OpenCV VideoWriter if FFmpeg isn't available or fails mid-run.
+- Temporal smoothing + mask hardening to avoid flicker/ghosting.
 Requirements for the models provider:
 - get_sam2() -> predictor or None
     # Model-only downscale (speedup without changing output resolution)
     max_model_size: Optional[int] = 1280
+    # FFmpeg / NVENC output (pipe). If disabled or unavailable, use OpenCV writer.
     use_nvenc: bool = True
     nvenc_codec: str = "h264"            # "h264" or "hevc"
     nvenc_preset: str = "p5"             # NVENC preset string
     movflags_faststart: bool = True
+    # ---------- stability & edge quality ----------
+    temporal_ema_alpha: float = 0.75   # higher = calmer (0.6–0.85 typical)
+    min_iou_to_accept: float = 0.05    # reject sudden mask jumps
+    dilate_px: int = 6                 # pad edges to keep hair/ears/shoulders
+    edge_blur_px: int = 1              # tiny blur to calm edge shimmer
+    # hardening (turn soft mask into crisper 0/1)
+    hard_low: float = 0.35             # values below -> 0
+    hard_high: float = 0.70            # values above -> 1
+    mask_gamma: float = 0.90           # <1 boosts mid-tones slightly
 class _FFmpegPipe:
     """
         self.proc: Optional[subprocess.Popen] = None
         self.encoder_used: Optional[str] = None
+        self._stderr: bytes | None = None
         self._ffmpeg = shutil.which("ffmpeg")
         if not self._ffmpeg:
         if frame_bgr.shape[0] != self.height or frame_bgr.shape[1] != self.width:
             raise ValueError(f"Frame size mismatch. Expected {self.width}x{self.height}, got {frame_bgr.shape[1]}x{frame_bgr.shape[0]}.")
+        # ensure contiguous for tobytes()
         frame_bgr = np.ascontiguousarray(frame_bgr)
         try:
             self.proc.stdin.write(frame_bgr.tobytes())
         if self.models is None:
             self.log.warning("CoreVideoProcessor initialized without a models provider; will use fallbacks.")
         self._ffmpeg = shutil.which("ffmpeg")
+        # state for temporal smoothing
         self._prev_mask: Optional[np.ndarray] = None
+    # ---------- mask post-processing (stability + crispness) ----------
+    def _iou(self, a: np.ndarray, b: np.ndarray, thr: float = 0.5) -> float:
         a_bin = (a >= thr).astype(np.uint8)
         b_bin = (b >= thr).astype(np.uint8)
+        inter = np.count_nonzero(cv2.bitwise_and(a_bin, b_bin))
+        union = np.count_nonzero(cv2.bitwise_or(a_bin, b_bin))
+        return (inter / union) if union else 0.0
+    def _harden(self, m: np.ndarray) -> np.ndarray:
+        # optional gamma
+        g = float(self.config.mask_gamma)
+        if abs(g - 1.0) > 1e-6:
+            m = np.clip(m, 0, 1) ** g
+        lo = float(self.config.hard_low)
+        hi = float(self.config.hard_high)
+        if hi > lo + 1e-6:
+            m = (m - lo) / (hi - lo)
+            m = np.clip(m, 0.0, 1.0)
+        # pad edges then tiny blur
+        k = int(self.config.dilate_px)
+        if k > 0:
+            se = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2*k+1, 2*k+1))
+            m = cv2.dilate(m, se, iterations=1)
+        eb = int(self.config.edge_blur_px)
+        if eb > 0:
+            m = cv2.GaussianBlur(m, (2*eb+1, 2*eb+1), 0)
         return np.clip(m, 0.0, 1.0)
+    def _stabilize(self, m: np.ndarray) -> np.ndarray:
+        if self._prev_mask is None:
+            self._prev_mask = m
+            return m
+        # outlier rejection
+        if self._iou(self._prev_mask, m, 0.5) < float(self.config.min_iou_to_accept):
+            # ignore this frame's mask → keep previous
+            return self._prev_mask
+        # EMA
+        a = float(self.config.temporal_ema_alpha)
+        m_ema = a * self._prev_mask + (1.0 - a) * m
+        self._prev_mask = m_ema
+        return m_ema
     # ---------- Single frame ----------
     def process_frame(self, frame_bgr: np.ndarray, background_rgb: np.ndarray) -> Dict[str, Any]:
         """
         Process one frame:
           - optionally downscale for model work,
           - segment + refine,
+          - temporal stabilize + harden,
           - upsample mask,
           - composite full-res.
         Returns dict with composited frame (BGR for writer) and mask (H,W float).
         """
             proc_frame_bgr = cv2.resize(frame_bgr, (newW, newH), interpolation=cv2.INTER_AREA)
             self.log.debug(f"Model-only downscale: {W}x{H} -> {newW}x{newH} (scale={scale:.3f})")
+        # RGB for models
+        proc_frame_rgb = cv2.cvtColor(proc_frame_bgr, cv2.COLOR_BGR2RGB)
         predictor = None
         try:
             if self.models and hasattr(self.models, "get_sam2"):
         except Exception as e:
             self.log.warning(f"SAM2 predictor unavailable: {e}")
+        # 1) segmentation (with internal fallbacks)
+        mask_small = segment_person_hq(proc_frame_rgb, predictor, use_sam2=True)
+        # 2) refinement (MatAnyOne if available)
         matanyone = None
         try:
             if self.models and hasattr(self.models, "get_matanyone"):
         except Exception as e:
             self.log.warning(f"MatAnyOne unavailable: {e}")
+        # IMPORTANT: call order is (frame, mask, matanyone=...)
+        mask_small_ref = refine_mask_hq(proc_frame_rgb, mask_small, matanyone=matanyone, use_matanyone=True)
+        # Stabilize + harden at model scale
+        mask_small_ref = np.clip(mask_small_ref.astype(np.float32), 0.0, 1.0)
+        mask_stable = self._stabilize(mask_small_ref)
+        mask_stable = self._harden(mask_stable)
         # Upsample mask back to full-res
         if scale != 1.0:
+            mask_full = cv2.resize(mask_stable, (W, H), interpolation=cv2.INTER_LINEAR)
         else:
+            mask_full = mask_stable
+        # 3) compositing (helpers expect RGB inputs; return RGB)
         frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
         out_rgb = replace_background_hq(frame_rgb, mask_full, background_rgb)
+        # Convert to BGR for writer
         out_bgr = cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR)
         return {"frame": out_bgr, "mask": mask_full}
         # Background once (RGB)
         background_rgb = self._prepare_background_from_config(bg_config, width, height)
+        # reset temporal state for a new video
+        self._prev_mask = None
         # Writer selection
+        ffmpeg_pipe: _FFmpegPipe | None = None
+        writer: cv2.VideoWriter | None = None
         ffmpeg_failed_reason = None
         if self.config.use_nvenc and self._ffmpeg:
         frame_count = 0
         start_time = time.time()
         try:
             while True:
                 ret, frame_bgr = cap.read()
                     try:
                         ffmpeg_pipe.write(out_bgr)
                     except Exception as e:
+                        # Switch to OpenCV writer mid-run and continue
                         self.log.warning("Switching to OpenCV writer after FFmpeg error at frame %d: %s", frame_count, e)
                         try:
                             ffmpeg_pipe.close()