Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 28

Commit

b8dd531

1 Parent(s): 4f1de42

Update processing/video/video_processor.py

Browse files

Files changed (1) hide show

processing/video/video_processor.py +259 -39

processing/video/video_processor.py CHANGED Viewed

@@ -10,6 +10,8 @@
 - FFmpeg pipe writer with encoder fallbacks and stderr surfacing; falls back
   to OpenCV VideoWriter if FFmpeg isn't available or fails mid-run.
 - Temporal smoothing + mask hardening to avoid flicker/ghosting.
 Requirements for the models provider:
 - get_sam2() -> predictor or None
@@ -19,7 +21,7 @@
 from __future__ import annotations
 from dataclasses import dataclass
-from typing import Optional, Dict, Any, Callable
 import time
 import threading
 import shutil
@@ -119,6 +121,10 @@ class ProcessorConfig:
     hard_high: float = 0.70            # values above -> 1
     mask_gamma: float = 0.90           # <1 boosts mid-tones slightly
 class _FFmpegPipe:
     """
@@ -362,10 +368,10 @@ def _stabilize(self, m: np.ndarray) -> np.ndarray:
         self._prev_mask = m_ema
         return m_ema
-    # ---------- Single frame ----------
     def process_frame(self, frame_bgr: np.ndarray, background_rgb: np.ndarray) -> Dict[str, Any]:
         """
-        Process one frame:
           - optionally downscale for model work,
           - segment + refine,
           - temporal stabilize + harden,
@@ -473,6 +479,50 @@ def _prepare_background_from_config(
         return create_professional_background(choice, width, height)  # RGB
     # ---------- Full video ----------
     def process_video(
         self,
@@ -526,54 +576,224 @@ def process_video(
                 cap.release()
                 raise RuntimeError(f"Could not open VideoWriter for: {output_path}")
         frame_count = 0
         start_time = time.time()
         try:
-            while True:
-                ret, frame_bgr = cap.read()
-                if not ret:
-                    break
-                if stop_event is not None and stop_event.is_set():
-                    self.log.info("Processing stopped by user request.")
-                    break
-                # Process single frame
-                result = self.process_frame(frame_bgr, background_rgb)
-                out_bgr = result["frame"]
-                out_bgr = np.ascontiguousarray(out_bgr)  # ensure contiguous for tobytes()
-                # Write
-                if ffmpeg_pipe is not None:
                     try:
-                        ffmpeg_pipe.write(out_bgr)
                     except Exception as e:
-                        # Switch to OpenCV writer mid-run and continue
-                        self.log.warning("Switching to OpenCV writer after FFmpeg error at frame %d: %s", frame_count, e)
                         try:
-                            ffmpeg_pipe.close()
                         except Exception:
                             pass
-                        ffmpeg_pipe = None
-                        if writer is None:
-                            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
-                            writer = cv2.VideoWriter(output_path, fourcc, float(fps_out), (width, height))
-                            if not writer.isOpened():
-                                raise RuntimeError(f"FFmpeg failed and VideoWriter could not open: {output_path}")
-                        writer.write(out_bgr)
-                else:
-                    writer.write(out_bgr)
-                frame_count += 1
-                # Progress
-                if progress_callback:
-                    elapsed = time.time() - start_time
-                    fps_live = frame_count / elapsed if elapsed > 0 else 0.0
                     try:
-                        progress_callback(frame_count, total_frames, fps_live)
                     except Exception:
                         pass
         finally:
             cap.release()
             if writer is not None:

 - FFmpeg pipe writer with encoder fallbacks and stderr surfacing; falls back
   to OpenCV VideoWriter if FFmpeg isn't available or fails mid-run.
 - Temporal smoothing + mask hardening to avoid flicker/ghosting.
+- NEW: Windowed two-phase execution (SAM2 window → release → MatAnyone window)
+  to avoid GPU fragmentation/OOM on T4 (16GB).
 Requirements for the models provider:
 - get_sam2() -> predictor or None
 from __future__ import annotations
 from dataclasses import dataclass
+from typing import Optional, Dict, Any, Callable, List, Tuple
 import time
 import threading
 import shutil
     hard_high: float = 0.70            # values above -> 1
     mask_gamma: float = 0.90           # <1 boosts mid-tones slightly
+    # ---------- NEW: windowed two-phase control ----------
+    use_windowed: bool = True          # enable two-phase SAM2→MatAnyone per chunk
+    window_size: int = 8               # frames per window
 class _FFmpegPipe:
     """
         self._prev_mask = m_ema
         return m_ema
+    # ---------- Single frame (fallback path) ----------
     def process_frame(self, frame_bgr: np.ndarray, background_rgb: np.ndarray) -> Dict[str, Any]:
         """
+        Process one frame (legacy per-frame path):
           - optionally downscale for model work,
           - segment + refine,
           - temporal stabilize + harden,
         return create_professional_background(choice, width, height)  # RGB
+    # ---------- Windowed two-phase helpers ----------
+    def _model_downscale(self, frame_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
+        """Apply model-only downscale; return (resized_bgr, scale)."""
+        H, W = frame_bgr.shape[:2]
+        max_side = max(H, W)
+        if self.config.max_model_size and max_side > self.config.max_model_size:
+            s = self.config.max_model_size / float(max_side)
+            newW = int(round(W * s))
+            newH = int(round(H * s))
+            small = cv2.resize(frame_bgr, (newW, newH), interpolation=cv2.INTER_AREA)
+            return small, s
+        return frame_bgr, 1.0
+    def _release_sam2_gpu(self, predictor):
+        """Best-effort release of SAM2 GPU residency between phases."""
+        try:
+            if predictor is None:
+                return
+            # Clear any sticky per-image state if exposed
+            for name in ("reset_image", "release_image", "clear_image", "clear_state"):
+                if hasattr(predictor, name) and callable(getattr(predictor, name)):
+                    try:
+                        getattr(predictor, name)()
+                    except Exception:
+                        pass
+            # Try moving large parts off-GPU (best-effort, may be no-op)
+            for name in ("to", "cpu"):
+                if hasattr(predictor, name):
+                    try:
+                        if name == "to":
+                            predictor.to("cpu")  # type: ignore[attr-defined]
+                        else:
+                            predictor.cpu()      # type: ignore[attr-defined]
+                    except Exception:
+                        pass
+        except Exception:
+            pass
+        try:
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        except Exception:
+            pass
     # ---------- Full video ----------
     def process_video(
         self,
                 cap.release()
                 raise RuntimeError(f"Could not open VideoWriter for: {output_path}")
+        # Determine models and decide execution mode
+        predictor = None
+        matanyone = None
+        try:
+            if self.models and hasattr(self.models, "get_sam2"):
+                predictor = self.models.get_sam2()
+        except Exception as e:
+            self.log.warning(f"SAM2 predictor unavailable: {e}")
+        try:
+            if self.models and hasattr(self.models, "get_matanyone"):
+                matanyone = self.models.get_matanyone()
+        except Exception as e:
+            self.log.warning(f"MatAnyOne unavailable: {e}")
+        use_windowed = bool(self.config.use_windowed and predictor is not None and matanyone is not None)
         frame_count = 0
         start_time = time.time()
         try:
+            if not use_windowed:
+                # --------- Legacy per-frame path (fallback) ----------
+                while True:
+                    ret, frame_bgr = cap.read()
+                    if not ret:
+                        break
+                    if stop_event is not None and stop_event.is_set():
+                        self.log.info("Processing stopped by user request.")
+                        break
+                    result = self.process_frame(frame_bgr, background_rgb)
+                    out_bgr = np.ascontiguousarray(result["frame"])
+                    if ffmpeg_pipe is not None:
+                        try:
+                            ffmpeg_pipe.write(out_bgr)
+                        except Exception as e:
+                            self.log.warning("Switching to OpenCV writer after FFmpeg error at frame %d: %s", frame_count, e)
+                            try:
+                                ffmpeg_pipe.close()
+                            except Exception:
+                                pass
+                            ffmpeg_pipe = None
+                            if writer is None:
+                                fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+                                writer = cv2.VideoWriter(output_path, fourcc, float(fps_out), (width, height))
+                                if not writer.isOpened():
+                                    raise RuntimeError(f"FFmpeg failed and VideoWriter could not open: {output_path}")
+                            writer.write(out_bgr)
+                    else:
+                        writer.write(out_bgr)
+                    frame_count += 1
+                    if progress_callback:
+                        elapsed = time.time() - start_time
+                        fps_live = frame_count / elapsed if elapsed > 0 else 0.0
+                        try: progress_callback(frame_count, total_frames, fps_live)
+                        except Exception: pass
+            else:
+                # --------- Windowed two-phase path ----------
+                WINDOW = max(1, int(self.config.window_size))
+                while True:
+                    # Read a window of frames
+                    frames_bgr: List[np.ndarray] = []
+                    for _ in range(WINDOW):
+                        ret, fr = cap.read()
+                        if not ret:
+                            break
+                        frames_bgr.append(fr)
+                    if not frames_bgr:
+                        break  # no more frames
+                    if stop_event is not None and stop_event.is_set():
+                        self.log.info("Processing stopped by user request.")
+                        break
+                    # Model-only downscale frames for model work (consistent per window)
+                    frames_small_bgr: List[np.ndarray] = []
+                    scales: List[float] = []
+                    for fr in frames_bgr:
+                        fr_small, s = self._model_downscale(fr)
+                        frames_small_bgr.append(fr_small)
+                        scales.append(s)
+                    # Use the first scale (frames normally same size)
+                    scale = scales[0] if scales else 1.0
+                    # Convert small frames to RGB for models
+                    frames_small_rgb = [cv2.cvtColor(fb, cv2.COLOR_BGR2RGB) for fb in frames_small_bgr]
+                    # -------- SAM2 phase (prime with first frame's mask) --------
+                    # We only need the mask for the first frame in the window.
                     try:
+                        mask_small = segment_person_hq(frames_small_rgb[0], predictor, use_sam2=True)
                     except Exception as e:
+                        self.log.warning(f"SAM2 segmentation error on window start: {e}")
+                        # Fall back to simple segmentation in helper
+                        mask_small = segment_person_hq(frames_small_rgb[0], None, use_sam2=False)
+                    # Release SAM2 GPU residency before MatAnyone phase
+                    self._release_sam2_gpu(predictor)
+                    # -------- MatAnyone phase (prime + propagate) --------
+                    # Reset session at window start if supported
+                    if hasattr(matanyone, "reset"):
                         try:
+                            matanyone.reset()
                         except Exception:
                             pass
+                    # j==0: pass 2-D mask; j>0: propagate without mask
+                    for j, fr_rgb_small in enumerate(frames_small_rgb):
+                        try:
+                            if j == 0:
+                                m2d = mask_small
+                                if m2d.ndim == 3:
+                                    m2d = m2d[..., 0]
+                                alpha_small = matanyone(fr_rgb_small, m2d)  # adapter returns float32 [h,w]
+                            else:
+                                alpha_small = matanyone(fr_rgb_small)      # propagate (no mask)
+                            # Stabilize + harden at model scale
+                            alpha_small = np.clip(alpha_small.astype(np.float32), 0.0, 1.0)
+                            alpha_stable = self._stabilize(alpha_small)
+                            alpha_harden = self._harden(alpha_stable)
+                            # Upsample back to full-res
+                            if scale != 1.0:
+                                H, W = frames_bgr[j].shape[:2]
+                                alpha_full = cv2.resize(alpha_harden, (W, H), interpolation=cv2.INTER_LINEAR)
+                            else:
+                                alpha_full = alpha_harden
+                            # Composite at full-res (expects RGB)
+                            frame_rgb_full = cv2.cvtColor(frames_bgr[j], cv2.COLOR_BGR2RGB)
+                            out_rgb = replace_background_hq(frame_rgb_full, alpha_full, background_rgb)
+                            out_bgr = cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR)
+                            out_bgr = np.ascontiguousarray(out_bgr)
+                            # Write
+                            if ffmpeg_pipe is not None:
+                                try:
+                                    ffmpeg_pipe.write(out_bgr)
+                                except Exception as e:
+                                    self.log.warning("Switching to OpenCV writer after FFmpeg error at frame %d: %s", frame_count, e)
+                                    try:
+                                        ffmpeg_pipe.close()
+                                    except Exception:
+                                        pass
+                                    ffmpeg_pipe = None
+                                    if writer is None:
+                                        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+                                        writer = cv2.VideoWriter(output_path, fourcc, float(fps_out), (width, height))
+                                        if not writer.isOpened():
+                                            raise RuntimeError(f"FFmpeg failed and VideoWriter could not open: {output_path}")
+                                    writer.write(out_bgr)
+                            else:
+                                writer.write(out_bgr)
+                            frame_count += 1
+                        except Exception as e:
+                            # If MatAnyone fails, log and fall back to SAM-only for this frame
+                            self.log.warning(f"MatAnyone failed at window frame {j}: {e}")
+                            # basic fallback: composite with original SAM mask for j==0, else reuse prev mask
+                            if j == 0:
+                                alpha_small_fb = np.clip(mask_small.astype(np.float32), 0.0, 1.0)
+                            else:
+                                alpha_small_fb = self._prev_mask if self._prev_mask is not None else np.zeros_like(alpha_small, dtype=np.float32)
+                            if scale != 1.0:
+                                H, W = frames_bgr[j].shape[:2]
+                                alpha_full_fb = cv2.resize(alpha_small_fb, (W, H), interpolation=cv2.INTER_LINEAR)
+                            else:
+                                alpha_full_fb = alpha_small_fb
+                            frame_rgb_full = cv2.cvtColor(frames_bgr[j], cv2.COLOR_BGR2RGB)
+                            out_rgb_fb = replace_background_hq(frame_rgb_full, alpha_full_fb, background_rgb)
+                            out_bgr_fb = cv2.cvtColor(out_rgb_fb, cv2.COLOR_RGB2BGR)
+                            if ffmpeg_pipe is not None:
+                                try:
+                                    ffmpeg_pipe.write(np.ascontiguousarray(out_bgr_fb))
+                                except Exception:
+                                    try:
+                                        ffmpeg_pipe.close()
+                                    except Exception:
+                                        pass
+                                    ffmpeg_pipe = None
+                                    if writer is None:
+                                        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+                                        writer = cv2.VideoWriter(output_path, fourcc, float(fps_out), (width, height))
+                                        if not writer.isOpened():
+                                            raise RuntimeError(f"FFmpeg failed and VideoWriter could not open: {output_path}")
+                                    writer.write(np.ascontiguousarray(out_bgr_fb))
+                            else:
+                                writer.write(np.ascontiguousarray(out_bgr_fb))
+                            frame_count += 1
+                        # Progress update
+                        if progress_callback:
+                            elapsed = time.time() - start_time
+                            fps_live = frame_count / elapsed if elapsed > 0 else 0.0
+                            try: progress_callback(frame_count, total_frames, fps_live)
+                            except Exception: pass
+                    # Clean per-window buffers (CPU) and let CUDA defrag
+                    del frames_bgr, frames_small_bgr, frames_small_rgb, mask_small
                     try:
+                        import torch
+                        if torch.cuda.is_available():
+                            torch.cuda.empty_cache()
                     except Exception:
                         pass
         finally:
             cap.release()
             if writer is not None: