image_segmentation_efficientsam / efficientSAM.py

Zhang-Yang-Sustech

Add multi-points input, foreground/background points input and box input to EfficientSAM model (#291)

bf92df0 2 months ago

5.2 kB

	import numpy as np
	import cv2 as cv

	class EfficientSAM:
	    """OpenCV DNN wrapper that runs an EfficientSAM model on point prompts.

	    The network is fed three named inputs (image, point coordinates, point
	    labels) and read back through two named outputs (mask logits and IoU
	    predictions).  Points labelled ``-1`` are treated as background hints:
	    they are not sent to the network but are used afterwards to reject
	    candidate masks that cover them.  Points with any other label are sent
	    to the network together with their label value.
	    """

	    def __init__(self, modelPath, backendId=0, targetId=0):
	        """Load the network from *modelPath* and select backend/target.

	        Args:
	            modelPath: Path handed to ``cv.dnn.readNet``.
	            backendId: Value passed to ``setPreferableBackend``.
	            targetId: Value passed to ``setPreferableTarget``.
	        """
	        self._modelPath = modelPath
	        self._backendId = backendId
	        self._targetId = targetId

	        self._model = cv.dnn.readNet(self._modelPath)
	        self._model.setPreferableBackend(self._backendId)
	        self._model.setPreferableTarget(self._targetId)
	        # The network takes 3 named inputs.
	        self._inputNames = ["batched_images", "batched_point_coords", "batched_point_labels"]

	        self._outputNames = ['output_masks', 'iou_predictions']  # actual output layer name
	        # (width, height) of the most recently preprocessed image.
	        self._currentInputSize = None
	        self._inputSize = [1024, 1024]  # input size for the model
	        # Fixed number of point slots sent to the network; unused slots are padded.
	        self._maxPointNums = 6
	        self._frontGroundPoints = []
	        self._backGroundPoints = []
	        self._labels = []

	    @property
	    def name(self):
	        """Return the class name."""
	        return self.__class__.__name__

	    def setBackendAndTarget(self, backendId, targetId):
	        """Store and apply a new preferable backend and target on the network."""
	        self._backendId = backendId
	        self._targetId = targetId
	        self._model.setPreferableBackend(self._backendId)
	        self._model.setPreferableTarget(self._targetId)

	    def _preprocess(self, image, points, labels):
	        """Turn an image and its point prompts into the three network inputs.

	        Args:
	            image: BGR image array (it is converted with ``COLOR_BGR2RGB``).
	            points: Sequence of ``(x, y)`` coordinates in the pixel space of
	                *image*.
	            labels: One label per point; ``-1`` marks a background point.

	        Returns:
	            Tuple ``(image_blob, points_blob, labels_blob)``.

	        Raises:
	            AssertionError: If more than ``_maxPointNums`` points are given or
	                the number of labels differs from the number of points.
	            ValueError: If the foreground points do not fit the point slots.
	        """
	        image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
	        # Remember the original size as (width, height); it is needed to scale
	        # the points now and to resize the masks back later.
	        self._currentInputSize = (image.shape[1], image.shape[0])

	        image = cv.resize(image, tuple(self._inputSize))
	        image = image.astype(np.float32, copy=False) / 255.0
	        image_blob = cv.dnn.blobFromImage(image)

	        points_blob, labels_blob = self._buildPromptBlobs(points, labels)
	        return image_blob, points_blob, labels_blob

	    def _buildPromptBlobs(self, points, labels):
	        """Split prompts into foreground/background and build the point blobs.

	        Relies on ``self._currentInputSize`` having been set to the original
	        image ``(width, height)``.  Background points (label ``-1``) are stored
	        as integer pixel coordinates in ``self._backGroundPoints``; the other
	        points are rescaled to the model input size and padded with zero
	        coordinates / ``-1`` labels up to ``_maxPointNums`` entries.

	        Returns:
	            Tuple ``(points_blob, labels_blob)`` of float32 arrays with shapes
	            ``(1, 1, _maxPointNums, 2)`` and ``(1, 1, _maxPointNums, 1)``.
	        """
	        # reshape() keeps the arrays 2-D / 1-D even when no point is supplied.
	        points = np.array(points, dtype=np.float32).reshape(-1, 2)
	        labels = np.array(labels, dtype=np.float32).reshape(-1)
	        assert points.shape[0] <= self._maxPointNums, f"Max input points number: {self._maxPointNums}"
	        assert points.shape[0] == labels.shape[0]

	        is_background = labels == -1
	        # Signed integers so that negative coordinates stay negative and are
	        # rejected by the bounds check instead of wrapping around.
	        self._backGroundPoints = points[is_background].astype(np.int64)

	        foreground = points[~is_background].astype(np.float64)
	        if len(foreground) > self._maxPointNums:
	            # Only reachable when asserts are disabled (python -O).
	            raise ValueError(f"Max input points number: {self._maxPointNums}")

	        # Convert the points to the model input (1024*1024) coordinate space.
	        foreground[:, 0] = foreground[:, 0] * self._inputSize[0] / self._currentInputSize[0]
	        foreground[:, 1] = foreground[:, 1] * self._inputSize[1] / self._currentInputSize[1]

	        pad_num = self._maxPointNums - len(foreground)
	        self._frontGroundPoints = np.vstack(
	            [foreground.astype(np.float32), np.zeros((pad_num, 2), dtype=np.float32)]
	        )
	        self._labels = np.vstack(
	            [labels[~is_background].reshape(-1, 1), np.full((pad_num, 1), -1, dtype=np.float32)]
	        )

	        points_blob = np.array([[self._frontGroundPoints]])
	        labels_blob = np.array([[self._labels]])
	        return points_blob, labels_blob

	    def infer(self, image, points, labels):
	        """Run the model on *image* with the given point prompts.

	        Args:
	            image: BGR image array.
	            points: Sequence of ``(x, y)`` coordinates in image pixels.
	            labels: One label per point; ``-1`` marks a background point.

	        Returns:
	            A ``uint8`` mask (values 0 or 255) resized to the size of *image*.
	        """
	        # Preprocess
	        imageBlob, pointsBlob, labelsBlob = self._preprocess(image, points, labels)
	        # Forward
	        self._model.setInput(imageBlob, self._inputNames[0])
	        self._model.setInput(pointsBlob, self._inputNames[1])
	        self._model.setInput(labelsBlob, self._inputNames[2])
	        outputs = self._model.forward(self._outputNames)
	        outputBlob, outputIou = outputs[0], outputs[1]
	        # Postprocess
	        return self._postprocess(outputBlob, outputIou)

	    def _postprocess(self, outputBlob, outputIou):
	        """Pick the best candidate mask that avoids every background point.

	        Candidates of the first image / first query are thresholded at 0,
	        ordered by descending predicted IoU and resized to the original image
	        size.  The first one that covers no background point is returned; if
	        all of them cover a background point, the highest-IoU mask is returned.
	        """
	        # Index 0 of the first axis is the single image in the batch, index 0
	        # of the second axis the single query; the third axis enumerates the
	        # candidate masks produced by the model.
	        masks = outputBlob[0, 0, :, :, :] >= 0
	        ious = outputIou[0, 0, :]

	        # Order the candidates by descending predicted IoU.
	        sorted_indices = np.argsort(ious)[::-1]
	        masks_uint8 = (masks[sorted_indices] * 255).astype(np.uint8)

	        # Bring every candidate back to the real image size.
	        resized_masks = [
	            cv.resize(mask, dsize=self._currentInputSize, interpolation=cv.INTER_NEAREST)
	            for mask in masks_uint8
	        ]

	        # A mask that covers a background point is not wanted.
	        for mask in resized_masks:
	            if not self._coversBackground(mask):
	                return mask

	        return resized_masks[0]

	    def _coversBackground(self, mask):
	        """Return True if *mask* is non-zero at any in-bounds background point."""
	        background = np.asarray(self._backGroundPoints, dtype=np.int64).reshape(-1, 2)
	        xs, ys = background[:, 0], background[:, 1]
	        # Points outside the mask are ignored rather than indexed.
	        inside = (xs >= 0) & (xs < mask.shape[1]) & (ys >= 0) & (ys < mask.shape[0])
	        return bool(mask[ys[inside], xs[inside]].any())