Zhang-Yang-Sustech
Add multi-points input, foreground/background points input and box input to EfficientSAM model (#291)
bf92df0
| import numpy as np | |
| import cv2 as cv | |
class EfficientSAM:
    """OpenCV DNN wrapper that runs an EfficientSAM network on prompt points.

    The network is fed three inputs (image, point coordinates, point labels)
    and produces candidate masks together with a predicted IoU per mask.
    A prompt point whose label is ``-1`` is treated as a *background* point:
    it is not sent to the network, but is used afterwards to reject candidate
    masks that cover it. Every other label is forwarded to the network
    unchanged together with its (rescaled) coordinates.
    """

    def __init__(self, modelPath, backendId=0, targetId=0):
        """Load the model from ``modelPath`` and select the DNN backend/target.

        Args:
            modelPath: path of the model file handed to ``cv.dnn.readNet``.
            backendId: value passed to ``setPreferableBackend``.
            targetId: value passed to ``setPreferableTarget``.
        """
        self._modelPath = modelPath
        self._backendId = backendId
        self._targetId = targetId

        self._model = cv.dnn.readNet(self._modelPath)
        self._model.setPreferableBackend(self._backendId)
        self._model.setPreferableTarget(self._targetId)

        # The network takes 3 named inputs.
        self._inputNames = ["batched_images", "batched_point_coords", "batched_point_labels"]
        # Actual output layer names.
        self._outputNames = ['output_masks', 'iou_predictions']

        # (width, height) of the most recently preprocessed image.
        self._currentInputSize = None
        # Spatial size the image is resized to before inference.
        self._inputSize = [1024, 1024]
        # Fixed number of prompt slots sent to the network; unused slots are padded.
        self._maxPointNums = 6

        # State of the most recent prompt, refreshed on every _preprocess call.
        self._frontGroundPoints = []
        self._backGroundPoints = []
        self._labels = []

    def name(self):
        """Return the class name."""
        return self.__class__.__name__

    def setBackendAndTarget(self, backendId, targetId):
        """Switch the DNN backend and target of the already loaded network."""
        self._backendId = backendId
        self._targetId = targetId
        self._model.setPreferableBackend(self._backendId)
        self._model.setPreferableTarget(self._targetId)

    def _preparePrompts(self, points, labels, imageSize):
        """Build the point/label blobs for the network from raw prompts.

        Side effects: stores the background points (original image
        coordinates, truncated to integers) in ``self._backGroundPoints`` and
        the padded network prompts in ``self._frontGroundPoints`` /
        ``self._labels``.

        Args:
            points: sequence of ``(x, y)`` positions in original image pixels.
            labels: one label per point; ``-1`` marks a background point.
            imageSize: ``(width, height)`` of the original image.

        Returns:
            ``(pointsBlob, labelsBlob)`` as float32 arrays of shape
            ``(1, 1, maxPointNums, 2)`` and ``(1, 1, maxPointNums, 1)``.

        Raises:
            AssertionError: if more than ``maxPointNums`` points are given or
                the number of labels differs from the number of points.
            ValueError: if more than ``maxPointNums`` non-background points
                remain (only reachable when asserts are disabled).
        """
        # Force an (N, 2) layout so that an empty prompt stays stackable.
        points = np.array(points, dtype=np.float32).reshape(-1, 2)
        labels = np.array(labels, dtype=np.float32).reshape(-1)
        assert points.shape[0] <= self._maxPointNums, f"Max input points number: {self._maxPointNums}"
        assert points.shape[0] == labels.shape[0]

        isBackground = labels == -1
        # Signed integers keep the later ">= 0" bounds check meaningful.
        self._backGroundPoints = points[isBackground].astype(np.int32)

        foreground = points[~isBackground]
        foregroundLabels = labels[~isBackground]

        padNum = self._maxPointNums - foreground.shape[0]
        if padNum < 0:
            # The assert above is stripped under "python -O"; fail loudly
            # instead of handing a malformed result to the caller.
            raise ValueError(f"Max input points number: {self._maxPointNums}")

        # Convert points from original image space to network input space.
        scaled = (foreground
                  * np.array(self._inputSize, dtype=np.float32)
                  / np.array(imageSize, dtype=np.float32))

        # Pad the unused prompt slots with zero coordinates and label -1.
        self._frontGroundPoints = np.vstack(
            [scaled, np.zeros((padNum, 2), dtype=np.float32)])
        self._labels = np.vstack(
            [foregroundLabels.reshape(-1, 1),
             np.full((padNum, 1), -1, dtype=np.float32)])

        pointsBlob = np.array([[self._frontGroundPoints]])
        labelsBlob = np.array([[self._labels]])
        return pointsBlob, labelsBlob

    def _preprocess(self, image, points, labels):
        """Convert a BGR image and its prompts into the three network inputs.

        Args:
            image: BGR image as read by OpenCV.
            points: sequence of ``(x, y)`` positions in original image pixels.
            labels: one label per point; ``-1`` marks a background point.

        Returns:
            ``(imageBlob, pointsBlob, labelsBlob)``.
        """
        image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
        # Record the input image size as (width, height) for later rescaling.
        self._currentInputSize = (image.shape[1], image.shape[0])

        image = cv.resize(image, self._inputSize)
        image = image.astype(np.float32, copy=False) / 255.0
        imageBlob = cv.dnn.blobFromImage(image)

        pointsBlob, labelsBlob = self._preparePrompts(
            points, labels, self._currentInputSize)
        return imageBlob, pointsBlob, labelsBlob

    def infer(self, image, points, labels):
        """Segment ``image`` according to the given prompt points.

        Args:
            image: BGR image as read by OpenCV.
            points: sequence of ``(x, y)`` positions in original image pixels.
            labels: one label per point; ``-1`` marks a background point.

        Returns:
            A uint8 mask (values 0 or 255) with the size of ``image``.
        """
        # Preprocess
        imageBlob, pointsBlob, labelsBlob = self._preprocess(image, points, labels)

        # Forward
        self._model.setInput(imageBlob, self._inputNames[0])
        self._model.setInput(pointsBlob, self._inputNames[1])
        self._model.setInput(labelsBlob, self._inputNames[2])
        outputs = self._model.forward(self._outputNames)
        outputBlob, outputIou = outputs[0], outputs[1]

        # Postprocess
        return self._postprocess(outputBlob, outputIou)

    def _selectMask(self, masks):
        """Return the first mask that covers none of the background points.

        Background points lying outside a mask are ignored. If every mask
        covers at least one background point, the first mask is returned.

        Args:
            masks: non-empty sequence of 2-D masks, best candidate first.
        """
        for mask in masks:
            height, width = mask.shape[:2]
            hitsBackground = any(
                mask[y, x]
                for (x, y) in self._backGroundPoints
                if 0 <= x < width and 0 <= y < height
            )
            if not hitsBackground:
                return mask
        return masks[0]

    def _postprocess(self, outputBlob, outputIou):
        """Turn raw network outputs into one mask at original image size.

        Args:
            outputBlob: mask logits. The first dimension is the batch (a
                single image, so 1), the second is the number of masks to
                generate, the third is the number of candidate masks output
                by the model.
            outputIou: predicted IoU of every candidate mask.

        Returns:
            The highest-IoU candidate (uint8, values 0 or 255) that covers no
            background point, or the highest-IoU candidate if all of them do.
        """
        masks = outputBlob[0, 0, :, :, :] >= 0
        ious = outputIou[0, 0, :]

        # Order the candidates by predicted IoU, best first.
        order = np.argsort(ious)[::-1]
        masksUint8 = (masks[order] * 255).astype(np.uint8)

        # Bring every candidate back to the original image size.
        resizedMasks = [
            cv.resize(mask, dsize=self._currentInputSize,
                      interpolation=cv.INTER_NEAREST)
            for mask in masksUint8
        ]

        # Candidates that cover a background point are not wanted.
        return self._selectMask(resizedMasks)