Update
app.py CHANGED
|
@@ -53,7 +53,7 @@ IMAGE_EXAMPLES = [
|
|
| 53 |
]
|
| 54 |
|
| 55 |
# Video
|
| 56 |
-
MAX_NUM_FRAMES =
|
| 57 |
BATCH_SIZE = 4
|
| 58 |
ALLOWED_VIDEO_EXTENSIONS = {".mp4", ".avi", ".mov"}
|
| 59 |
VIDEO_OUTPUT_DIR = Path("static/videos")
|
|
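With MAX_NUM_FRAMES pinned at 250, the decoding work in process_video is bounded regardless of clip length. A quick check of the formula used later in this file (the clip length and stride here are hypothetical):

MAX_NUM_FRAMES = 250
total_frames = 1800        # hypothetical: a 60 s clip at 30 fps
read_each_i_frame = 2      # hypothetical stride: keep every 2nd frame
n_frames_to_read = min(MAX_NUM_FRAMES, total_frames // read_each_i_frame)
print(n_frames_to_read)    # 250 -- the cap wins over the 900 strided frames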
@@ -70,18 +70,28 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 
+@lru_cache(maxsize=3)
+def get_model_and_processor(checkpoint: str):
+    model = AutoModelForObjectDetection.from_pretrained(checkpoint, torch_dtype=TORCH_DTYPE)
+    image_processor = AutoImageProcessor.from_pretrained(checkpoint)
+    return model, image_processor
+
+
 @spaces.GPU(duration=20)
 def detect_objects(
     checkpoint: str,
-    images: List[np.ndarray],
+    images: List[np.ndarray] | np.ndarray,
     confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
     target_size: Optional[Tuple[int, int]] = None,
     batch_size: int = BATCH_SIZE,
 ):
 
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    model =
-
+    model, image_processor = get_model_and_processor(checkpoint)
+    model = model.to(device)
+
+    if isinstance(images, np.ndarray) and images.ndim == 4:
+        images = [x for x in images]  # split video array into list of images
 
     batches = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
 
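The new get_model_and_processor helper memoizes checkpoint loading with functools.lru_cache, so switching between up to three checkpoints no longer reloads weights on every call; .to(device) stays inside detect_objects because on ZeroGPU, CUDA is only available inside the @spaces.GPU-decorated function. A minimal self-contained sketch of the same pattern (the checkpoint name and TORCH_DTYPE value are stand-ins, not taken from this diff):

from functools import lru_cache

import torch
from transformers import AutoImageProcessor, AutoModelForObjectDetection

TORCH_DTYPE = torch.float16  # assumption: app.py defines this constant elsewhere

@lru_cache(maxsize=3)
def get_model_and_processor(checkpoint: str):
    # Keyed on the checkpoint string: up to three distinct checkpoints stay
    # resident; asking for one of them again returns the cached pair.
    model = AutoModelForObjectDetection.from_pretrained(checkpoint, torch_dtype=TORCH_DTYPE)
    image_processor = AutoImageProcessor.from_pretrained(checkpoint)
    return model, image_processor

m1, _ = get_model_and_processor("PekingU/rtdetr_r50vd")  # hypothetical checkpoint
m2, _ = get_model_and_processor("PekingU/rtdetr_r50vd")
assert m1 is m2  # cache hit: the weights were loaded only once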
@@ -205,12 +215,13 @@ def process_video(
 
     n_frames_to_read = min(MAX_NUM_FRAMES, video_info.total_frames // read_each_i_frame)
     frames = read_video_k_frames(video_path, n_frames_to_read, read_each_i_frame)
+    frames = [cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_CUBIC) for frame in frames]
 
     box_annotator = sv.BoxAnnotator(thickness=1)
     label_annotator = sv.LabelAnnotator(text_scale=0.5)
 
     results, id2label = detect_objects(
-        images=frames,
+        images=np.array(frames),
         checkpoint=checkpoint,
         confidence_threshold=confidence_threshold,
         target_size=(target_height, target_width),
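Stacking the frames with np.array only works because the cv2.resize pass above gives every frame the same shape (note that cv2.resize takes sizes in (width, height) order); detect_objects then splits the resulting 4-D array back into a list via the new images.ndim == 4 branch. A small shape check with dummy frames:

import numpy as np

frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(8)]  # equal shapes after resizing
video = np.array(frames)
print(video.shape, video.ndim)  # (8, 480, 640, 3) 4 -> hits the ndim == 4 branch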
@@ -218,7 +229,6 @@ def process_video(
 
     annotated_frames = []
     for frame, result in tqdm.tqdm(zip(frames, results), desc="Annotating frames", total=len(frames)):
-        frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_AREA)
         detections = sv.Detections.from_transformers(result, id2label=id2label)
         detections = detections.with_nms(threshold=0.95, class_agnostic=True)
         annotated_frame = box_annotator.annotate(scene=frame, detections=detections)
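The per-frame resize is dropped here because the frames are now resized once before detection, so annotation draws on the same arrays the model saw. For reference, a self-contained sketch of the supervision calls in this loop, fed a hand-built dict in the shape transformers' post-processing returns (all values hypothetical):

import torch
import supervision as sv

result = {
    "scores": torch.tensor([0.90, 0.88]),
    "labels": torch.tensor([0, 0]),
    "boxes": torch.tensor([[10.0, 10.0, 100.0, 100.0],
                           [10.0, 10.0, 100.0, 100.0]]),  # duplicate box
}
detections = sv.Detections.from_transformers(result, id2label={0: "person"})
detections = detections.with_nms(threshold=0.95, class_agnostic=True)
print(len(detections))  # 1 -- the duplicate exceeds the 0.95 IoU threshold and is suppressed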
@@ -226,7 +236,7 @@ def process_video(
     annotated_frames.append(annotated_frame)
 
     output_filename = os.path.join(VIDEO_OUTPUT_DIR, f"output_{uuid.uuid4()}.mp4")
-    iio.imwrite(output_filename, annotated_frames, fps=target_fps, codec="h264")
+    iio.imwrite(output_filename, annotated_frames, fps=target_fps, codec="h264")
     return output_filename
 
 
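For reference, imageio's v3 API writes a list of RGB frames straight to an H.264 MP4, with fps and codec forwarded to the video plugin, as in the call above. A minimal standalone version (path, fps, and frame size are hypothetical):

import imageio.v3 as iio
import numpy as np

frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(25)]  # dummy frames
iio.imwrite("static/videos/demo.mp4", frames, fps=25, codec="h264")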