Add multi-points input, foreground/background points input and box input to EfficientSAM model (#291)

* a

* add efficientsam model and basic demo

* update license

* remove example images

* update readme

* update readme

* update demo

* update demo

* update readme

* update SAM and __init__

* update demo and sam

* update label

* add present gif

* update readme

* add efficientSAM gif to readme of opencvzoo

* cv version 4.10.0, remove camera branch

* 1. add multi-point inference (max: 6)
2. add box prompt(drag), add background point(long press)
3. model fix to 1024*1024
4. label padding -1
5. update demo

* replace the model with a new model that supports multi-point input, update demo

* update readme

* update readme

* change window size to (800*600); displayed pictures cannot exceed it

* add int8 model

* update demo

* update README

* check OpenCV version

* update model name in demo

* update model name in demo

* Add a key to exit ('q' and 'Q'); When clicks reach maximum, no box shows; comment useless print, delete useless whitespace

* update demo with some ASCII

Files changed (3) hide show

README.md +13 -5
demo.py +152 -42
efficientSAM.py +91 -28

README.md CHANGED Viewed

@@ -3,9 +3,16 @@
 EfficientSAM: Leveraged Masked Image Pretraining for Efficient Segment Anything
 Notes:
-- The current implementation of the EfficientSAM demo uses the EfficientSAM-Ti model, which is specifically tailored for scenarios requiring higher speed and lightweight.
-- MD5 value of "efficient_sam_vitt.pt" is 7A804DA508F30EFC59EC06711C8DCD62
-- SHA-256 value of "efficient_sam_vitt.pt" is DFF858B19600A46461CBB7DE98F796B23A7A888D9F5E34C0B033F7D6EB9E4E6A
 ## Demo
@@ -17,7 +24,7 @@ Run the following command to try the demo:
 python demo.py --input /path/to/image
 ```
-Click only **once** on the object you wish to segment in the displayed image. After the click, the segmentation result will be shown in a new window.
 ## Result
@@ -41,4 +48,5 @@ All files in this directory are licensed under [Apache 2.0 License](./LICENSE).
 ## Reference
 - https://arxiv.org/abs/2312.00863
-- https://github.com/yformer/EfficientSAM

 EfficientSAM: Leveraged Masked Image Pretraining for Efficient Segment Anything
 Notes:
+- The current implementation of the EfficientSAM demo uses the EfficientSAM-Ti model, which is specifically tailored for scenarios requiring higher speed and lightweight.
+- image_segmentation_efficientsam_ti_2024may.onnx (supports only single-point inference)
+  - MD5 value: 117d6a6cac60039a20b399cc133c2a60
+  - SHA-256 value: e3957d2cd1422855f350aa7b044f47f5b3eafada64b5904ed330b696229e2943
+- image_segmentation_efficientsam_ti_2025april.onnx
+  - MD5 value: f23cecbb344547c960c933ff454536a3
+  - SHA-256 value: 4eb496e0a7259d435b49b66faf1754aa45a5c382a34558ddda9a8c6fe5915d77
+- image_segmentation_efficientsam_ti_2025april_int8.onnx
+  - MD5 value: a1164f44b0495b82e9807c7256e95a50
+  - SHA-256 value: 5ecc8d59a2802c32246e68553e1cf8ce74cf74ba707b84f206eb9181ff774b4e
 ## Demo
 python demo.py --input /path/to/image
 ```
+**Click** to select foreground points, **drag** to draw a selection box, and **long press** to select background points on the object you wish to segment in the displayed image. After pressing **Enter**, the segmentation result will be shown in a new window. Press **Backspace** to clear all the prompts.
 ## Result
 ## Reference
 - https://arxiv.org/abs/2312.00863
+- https://github.com/yformer/EfficientSAM
+- https://github.com/facebookresearch/segment-anything

demo.py CHANGED Viewed

@@ -20,8 +20,8 @@ backend_target_pairs = [
 parser = argparse.ArgumentParser(description='EfficientSAM Demo')
 parser.add_argument('--input', '-i', type=str,
                     help='Set input path to a certain image.')
-parser.add_argument('--model', '-m', type=str, default='image_segmentation_efficientsam_ti_2024may.onnx',
-                    help='Set model path, defaults to image_segmentation_efficientsam_ti_2024may.onnx.')
 parser.add_argument('--backend_target', '-bt', type=int, default=0,
                     help='''Choose one of the backend-target pair to run this demo:
                         {:d}: (default) OpenCV implementation + CPU,
@@ -34,10 +34,14 @@ parser.add_argument('--save', '-s', action='store_true',
                     help='Specify to save a file with results. Invalid in case of camera input.')
 args = parser.parse_args()
-#global click listener
-clicked_left = False
-#global point record in the window
-point = []
 def visualize(image, result):
     """
@@ -55,26 +59,88 @@ def visualize(image, result):
     mask = np.copy(result)
     # change mask to binary image
     t, binary = cv.threshold(mask, 127, 255, cv.THRESH_BINARY)
-    assert set(np.unique(binary)) <= {0, 255}, "The mask must be a binary image"
     # enhance red channel to make the segmentation more obviously
     enhancement_factor = 1.8
-    red_channel = vis_result[:, :, 2]
     # update the channel
     red_channel = np.where(binary == 255, np.minimum(red_channel * enhancement_factor, 255), red_channel)
-    vis_result[:, :, 2] = red_channel
     # draw borders
     contours, hierarchy = cv.findContours(binary, cv.RETR_LIST, cv.CHAIN_APPROX_TC89_L1)
     cv.drawContours(vis_result, contours, contourIdx = -1, color = (255,255,255), thickness=2)
     return vis_result
 def select(event, x, y, flags, param):
-    global clicked_left
-    # When the left mouse button is pressed, record the coordinates of the point where it is pressed
-    if event == cv.EVENT_LBUTTONUP:
-        point.append([x,y])
-        print("point:",point[0])
-        clicked_left = True
 if __name__ == '__main__':
     backend_id = backend_target_pairs[args.backend_target][0]
@@ -89,49 +155,93 @@ if __name__ == '__main__':
             print('Could not open or find the image:', args.input)
             exit(0)
         # create window
-        image_window = "image: click on the thing whick you want to segment!"
         cv.namedWindow(image_window, cv.WINDOW_NORMAL)
         # change window size
-        cv.resizeWindow(image_window, 800 if image.shape[0] > 800 else image.shape[0], 600 if image.shape[1] > 600 else image.shape[1])
         # put the window on the left of the screen
         cv.moveWindow(image_window, 50, 100)
         # set listener to record user's click point
-        cv.setMouseCallback(image_window, select)
         # tips in the terminal
-        print("click the picture on the LEFT and see the result on the RIGHT!")
         # show image
         cv.imshow(image_window, image)
         # waiting for click
-        while cv.waitKey(1) == -1 or clicked_left:
-            # receive click
-            if clicked_left:
-                # put the click point (x,y) into the model to predict
-                result = model.infer(image=image, points=point, labels=[1])
-                # get the visualized result
-                vis_result = visualize(image, result)
-                # create window to show visualized result
-                cv.namedWindow("vis_result", cv.WINDOW_NORMAL)
-                cv.resizeWindow("vis_result", 800 if vis_result.shape[0] > 800 else vis_result.shape[0], 600 if vis_result.shape[1] > 600 else vis_result.shape[1])
-                cv.moveWindow("vis_result", 851, 100)
-                cv.imshow("vis_result", vis_result)
-                # set click false to listen another click
-                clicked_left = False
-            elif cv.getWindowProperty(image_window, cv.WND_PROP_VISIBLE) < 1:
-                # if click × to close the image window then ending
                 break
-            else:
-                # when not clicked, set point to empty
-                point = []
         cv.destroyAllWindows()
         # Save results if save is true
         if args.save:
             cv.imwrite('./example_outputs/vis_result.jpg', vis_result)
             cv.imwrite("./example_outputs/mask.jpg", result)
             print('vis_result.jpg and mask.jpg are saved to ./example_outputs/')
     else:
         print('Set input path to a certain image.')
         pass

 parser = argparse.ArgumentParser(description='EfficientSAM Demo')
 parser.add_argument('--input', '-i', type=str,
                     help='Set input path to a certain image.')
+parser.add_argument('--model', '-m', type=str, default='image_segmentation_efficientsam_ti_2025april.onnx',
+                    help='Set model path, defaults to image_segmentation_efficientsam_ti_2025april.onnx.')
 parser.add_argument('--backend_target', '-bt', type=int, default=0,
                     help='''Choose one of the backend-target pair to run this demo:
                         {:d}: (default) OpenCV implementation + CPU,
                     help='Specify to save a file with results. Invalid in case of camera input.')
 args = parser.parse_args()
+# Global configuration
+WINDOW_SIZE = (800, 600)  # Upper bound for the window size (width, height); the main script replaces it with the size scaled to the image
+MAX_POINTS = 6             # Maximum number of prompt entries; a box consumes two of them
+points = []                # Prompt coordinates [x, y] as reported by the mouse callback
+labels = []                # Label per entry in `points` (-1: background / long press, 1: foreground, 2: box top-left, 3: box bottom-right)
+backend_point = []         # [x, y] of the pending left-button press; empty when no press is pending
+rectangle = False          # True while a drag (box selection) is in progress
+current_img = None         # Image currently shown with the prompt markers; base for the drag preview
 def visualize(image, result):
     """
     mask = np.copy(result)
     # change mask to binary image
     t, binary = cv.threshold(mask, 127, 255, cv.THRESH_BINARY)
+    assert set(np.unique(binary)) <= {0, 255}, "The mask must be a binary image."
     # enhance red channel to make the segmentation more obviously
     enhancement_factor = 1.8
+    red_channel = vis_result[:, :, 2]
     # update the channel
     red_channel = np.where(binary == 255, np.minimum(red_channel * enhancement_factor, 255), red_channel)
+    vis_result[:, :, 2] = red_channel
     # draw borders
     contours, hierarchy = cv.findContours(binary, cv.RETR_LIST, cv.CHAIN_APPROX_TC89_L1)
     cv.drawContours(vis_result, contours, contourIdx = -1, color = (255,255,255), thickness=2)
     return vis_result
 def select(event, x, y, flags, param):
+    """Handle mouse events with coordinate conversion"""
+    global points, labels, backend_point, rectangle, current_img
+    orig_img = param['original_img']
+    image_window = param['image_window']
+    if event == cv.EVENT_LBUTTONDOWN:
+        param['mouse_down_time'] = cv.getTickCount()
+        backend_point = [x, y]
+    elif event == cv.EVENT_MOUSEMOVE:
+        if rectangle == True:
+            rectangle_change_img = current_img.copy()
+            cv.rectangle(rectangle_change_img, (backend_point[0], backend_point[1]), (x, y), (255,0,0) , 2)
+            cv.imshow(image_window, rectangle_change_img)
+        elif len(backend_point) != 0 and len(points) < MAX_POINTS:
+            rectangle = True
+    elif event == cv.EVENT_LBUTTONUP:
+        if len(points) >= MAX_POINTS:
+            print(f"Maximum points reached {MAX_POINTS}.")
+            return
+        if rectangle == False:
+            duration = (cv.getTickCount() - param['mouse_down_time'])/cv.getTickFrequency()
+            label = -1 if duration > 0.5 else 1  # Long press = background
+            points.append([backend_point[0], backend_point[1]])
+            labels.append(label)
+            print(f"Added {['background','foreground','background'][label]} point {backend_point}.")
+        else:
+            if len(points) + 1 >= MAX_POINTS:
+                rectangle = False
+                backend_point.clear()
+                cv.imshow(image_window, current_img)
+                print(f"Points reached {MAX_POINTS}, could not add box.")
+                return
+            point_leftup = []
+            point_rightdown = []
+            if x > backend_point[0] or y > backend_point[1]:
+                point_leftup.extend(backend_point)
+                point_rightdown.extend([x,y])
+            else:
+                point_leftup.extend([x,y])
+                point_rightdown.extend(backend_point)
+            points.append(point_leftup)
+            points.append(point_rightdown)
+            print(f"Added box from {point_leftup} to {point_rightdown}.")
+            labels.append(2)
+            labels.append(3)
+            rectangle = False
+        backend_point.clear()
+        marked_img = orig_img.copy()
+        top_left = None
+        for (px, py), lbl in zip(points, labels):
+            if lbl == -1:
+                cv.circle(marked_img, (px, py), 5, (0, 0, 255), -1)
+            elif lbl == 1:
+                cv.circle(marked_img, (px, py), 5, (0, 255, 0), -1)
+            elif lbl == 2:
+                top_left = (px, py)
+            elif lbl == 3:
+                bottom_right = (px, py)
+                cv.rectangle(marked_img, top_left, bottom_right, (255,0,0) , 2)
+        cv.imshow(image_window, marked_img)
+        current_img = marked_img.copy()
 if __name__ == '__main__':
     backend_id = backend_target_pairs[args.backend_target][0]
             print('Could not open or find the image:', args.input)
             exit(0)
         # create window
+        image_window = "Origin image"
         cv.namedWindow(image_window, cv.WINDOW_NORMAL)
         # change window size
+        rate = 1
+        rate1 = 1
+        rate2 = 1
+        if(image.shape[1]>WINDOW_SIZE[0]):
+            rate1 = WINDOW_SIZE[0]/image.shape[1]
+        if(image.shape[0]>WINDOW_SIZE[1]):
+            rate2 = WINDOW_SIZE[1]/image.shape[0]
+        rate = min(rate1, rate2)
+        # width, height
+        WINDOW_SIZE = (int(image.shape[1] * rate), int(image.shape[0] * rate))
+        cv.resizeWindow(image_window, WINDOW_SIZE[0], WINDOW_SIZE[1])
         # put the window on the left of the screen
         cv.moveWindow(image_window, 50, 100)
         # set listener to record user's click point
+        param = {
+            'original_img': image,
+            'mouse_down_time': 0,
+            'image_window' : image_window
+        }
+        cv.setMouseCallback(image_window, select, param)
         # tips in the terminal
+        print("Click — Select foreground point\n"
+        "Long press — Select background point\n"
+        "Drag — Create selection box\n"
+        "Enter — Infer\n"
+        "Backspace — Clear the prompts\n"
+        "Q - Quit")
         # show image
         cv.imshow(image_window, image)
+        current_img = image.copy()
+        # create window to show visualized result
+        vis_image = image.copy()
+        segmentation_window = "Segment result"
+        cv.namedWindow(segmentation_window, cv.WINDOW_NORMAL)
+        cv.resizeWindow(segmentation_window, WINDOW_SIZE[0], WINDOW_SIZE[1])
+        cv.moveWindow(segmentation_window, WINDOW_SIZE[0]+51, 100)
+        cv.imshow(segmentation_window, vis_image)
         # waiting for click
+        while True:
+            # Check window status
+            # if click × to close the image window then ending
+            if (cv.getWindowProperty(image_window, cv.WND_PROP_VISIBLE) < 1 or
+                cv.getWindowProperty(segmentation_window, cv.WND_PROP_VISIBLE) < 1):
                 break
+            # Handle keyboard input
+            key = cv.waitKey(1)
+            # receive enter
+            if key == 13:
+                vis_image = image.copy()
+                cv.putText(vis_image, "infering...",
+                            (50, vis_image.shape[0]//2),
+                            cv.FONT_HERSHEY_SIMPLEX, 10, (255,255,255), 5)
+                cv.imshow(segmentation_window, vis_image)
+                result = model.infer(image=image, points=points, labels=labels)
+                if len(result) == 0:
+                    print("clear and select points again!")
+                else:
+                    vis_result = visualize(image, result)
+                    cv.imshow(segmentation_window, vis_result)
+            elif key == 8 or key == 127:  # ASCII for Backspace or Delete
+                points.clear()
+                labels.clear()
+                backend_point = []
+                rectangle = False
+                current_img = image
+                print("Points are cleared.")
+                cv.imshow(image_window, image)
+            elif key == ord('q') or key == ord('Q'):
+                break
         cv.destroyAllWindows()
         # Save results if save is true
         if args.save:
             cv.imwrite('./example_outputs/vis_result.jpg', vis_result)
             cv.imwrite("./example_outputs/mask.jpg", result)
             print('vis_result.jpg and mask.jpg are saved to ./example_outputs/')
     else:
         print('Set input path to a certain image.')
         pass

efficientSAM.py CHANGED Viewed

@@ -11,11 +11,15 @@ class EfficientSAM:
         self._model.setPreferableBackend(self._backendId)
         self._model.setPreferableTarget(self._targetId)
         # 3 inputs
-        self._inputNames = ["batched_images", "batched_point_coords", "batched_point_labels"]
-        self._outputNames = ['output_masks']  # actual output layer name
         self._currentInputSize = None
-        self._inputSize = [640, 640]  # input size for the model
     @property
     def name(self):
@@ -28,26 +32,54 @@ class EfficientSAM:
         self._model.setPreferableTarget(self._targetId)
     def _preprocess(self, image, points, labels):
         image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
         # record the input image size, (width, height)
         self._currentInputSize = (image.shape[1], image.shape[0])
         image = cv.resize(image, self._inputSize)
         image = image.astype(np.float32, copy=False) / 255.0
-        # convert points to (640*640) size space
-        for p in points:
-            p[0] = int(p[0] * self._inputSize[0]/self._currentInputSize[0])
-            p[1] = int(p[1]* self._inputSize[1]/self._currentInputSize[1])
         image_blob = cv.dnn.blobFromImage(image)
-        points_blob = np.array([[points]], dtype=np.float32)
-        labels_blob = np.array([[[labels]]])
         return image_blob, points_blob, labels_blob
     def infer(self, image, points, labels):
@@ -57,17 +89,48 @@ class EfficientSAM:
         self._model.setInput(imageBlob, self._inputNames[0])
         self._model.setInput(pointsBlob, self._inputNames[1])
         self._model.setInput(labelsBlob, self._inputNames[2])
-        outputBlob = self._model.forward()
         # Postprocess
-        results = self._postprocess(outputBlob)
         return results
-    def _postprocess(self, outputBlob):
-        mask = outputBlob[0, 0, 0, :, :] >= 0
-        mask_uint8 = (mask * 255).astype(np.uint8)
         # change to real image size
-        mask_uint8 = cv.resize(mask_uint8, dsize=self._currentInputSize, interpolation=2)
-        return mask_uint8

         self._model.setPreferableBackend(self._backendId)
         self._model.setPreferableTarget(self._targetId)
         # 3 inputs
+        self._inputNames = ["batched_images", "batched_point_coords", "batched_point_labels"]
+        self._outputNames = ['output_masks', 'iou_predictions']  # actual output layer name
         self._currentInputSize = None
+        self._inputSize = [1024, 1024]  # input size for the model
+        self._maxPointNums = 6
+        self._frontGroundPoints = []
+        self._backGroundPoints = []
+        self._labels = []
     @property
     def name(self):
         self._model.setPreferableTarget(self._targetId)
     def _preprocess(self, image, points, labels):
         image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
         # record the input image size, (width, height)
         self._currentInputSize = (image.shape[1], image.shape[0])
         image = cv.resize(image, self._inputSize)
         image = image.astype(np.float32, copy=False) / 255.0
         image_blob = cv.dnn.blobFromImage(image)
+        points = np.array(points, dtype=np.float32)
+        labels = np.array(labels, dtype=np.float32)
+        assert points.shape[0] <= self._maxPointNums, f"Max input points number: {self._maxPointNums}"
+        assert points.shape[0] == labels.shape[0]
+        frontGroundPoints = []
+        backGroundPoints = []
+        inputLabels = []
+        for i in range(len(points)):
+            if labels[i] == -1:
+                backGroundPoints.append(points[i])
+            else:
+                frontGroundPoints.append(points[i])
+                inputLabels.append(labels[i])
+        self._backGroundPoints = np.uint32(backGroundPoints)
+        # print("input:")
+        # print(" back: ", self._backGroundPoints)
+        # print(" front: ", frontGroundPoints)
+        # print(" label: ", inputLabels)
+        # convert points to (1024*1024) size space
+        for p in frontGroundPoints:
+            p[0] = np.float32(p[0] * self._inputSize[0]/self._currentInputSize[0])
+            p[1] = np.float32(p[1] * self._inputSize[1]/self._currentInputSize[1])
+        if len(frontGroundPoints) > self._maxPointNums:
+            return "no"
+        pad_num = self._maxPointNums - len(frontGroundPoints)
+        self._frontGroundPoints = np.vstack([frontGroundPoints, np.zeros((pad_num, 2), dtype=np.float32)])
+        inputLabels_arr = np.array(inputLabels, dtype=np.float32).reshape(-1, 1)
+        self._labels = np.vstack([inputLabels_arr, np.full((pad_num, 1), -1, dtype=np.float32)])
+        points_blob = np.array([[self._frontGroundPoints]])
+        labels_blob = np.array([[self._labels]])
         return image_blob, points_blob, labels_blob
     def infer(self, image, points, labels):
         self._model.setInput(imageBlob, self._inputNames[0])
         self._model.setInput(pointsBlob, self._inputNames[1])
         self._model.setInput(labelsBlob, self._inputNames[2])
+        # print("infering...")
+        outputs = self._model.forward(self._outputNames)
+        outputBlob, outputIou = outputs[0], outputs[1]
         # Postprocess
+        results = self._postprocess(outputBlob, outputIou)
+        # print("done")
         return results
+    def _postprocess(self, outputBlob, outputIou):
+        # The masks are already sorted by their predicted IOUs.
+        # The first dimension is the batch size (we have a single image. so it is 1).
+        # The second dimension is the number of masks we want to generate
+        # The third dimension is the number of candidate masks output by the model.
+        masks = outputBlob[0, 0, :, :, :] >= 0
+        ious = outputIou[0, 0, :]
+        # sorted by ious
+        sorted_indices = np.argsort(ious)[::-1]
+        sorted_masks = masks[sorted_indices]
+        # sorted by area
+        # mask_areas = np.sum(masks, axis=(1, 2))
+        # sorted_indices = np.argsort(mask_areas)
+        # sorted_masks = masks[sorted_indices]
+        masks_uint8 = (sorted_masks * 255).astype(np.uint8)
         # change to real image size
+        resized_masks = [
+            cv.resize(mask, dsize=self._currentInputSize,
+                    interpolation=cv.INTER_NEAREST)
+            for mask in masks_uint8
+        ]
+        # background mask don't need
+        for mask in resized_masks:
+            contains_bg = any(
+                mask[y, x] if (0 <= x < mask.shape[1] and 0 <= y < mask.shape[0])
+                else False
+                for (x, y) in self._backGroundPoints
+            )
+            if not contains_bg:
+                return mask
+        return resized_masks[0]