#Copied From https://github.com/microsoft/DAViD/blob/main/runtime/utils.py
"""Utility classes and functions for image processing and ROI operations.

Copyright (c) Microsoft Corporation.

MIT License

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import cv2
import numpy as np

# Execution providers listed in order: CUDA first, then CPU.
ONNX_EP = ["CUDAExecutionProvider", "CPUExecutionProvider"]

# Brightest representable value for each supported unsigned integer dtype;
# used as the divisor when normalizing integer images to [0, 1].
UINT8_MAX = np.iinfo(np.uint8).max
UINT16_MAX = np.iinfo(np.uint16).max


class ImageFormatError(Exception):
    """Exception raised for invalid image formats."""


class ModelNotFoundError(Exception):
    """Exception raised when model file is not found."""


def preprocess_img(img: np.ndarray) -> np.ndarray:
    """Preprocess a BGR image for DNN inference.

    Converts the image to float (if it is not already) and normalizes it to
    [0, 1]. Unsigned integer images are normalized by dividing by the
    brightest possible value of their dtype (e.g. 255 for uint8). The result
    is always clipped to [0, 1], so out-of-range float inputs are saturated.

    Args:
        img: The image to preprocess, with shape (height, width, 3). Can be
            uint8, uint16, float16, float32 or float64.

    Returns:
        The preprocessed image in np.float32 format.

    Raises:
        ImageFormatError: If the image is not three channels or not uint8,
            uint16, float16, float32 or float64.
    """
    if img.ndim != 3 or img.shape[2] != 3:
        raise ImageFormatError(f"image must be 3 channels, got shape: {img.shape}")

    # A list (not a set) is required here: np.dtype instances compare equal to
    # the scalar types via ==, which list membership uses.
    supported_dtypes = [np.uint8, np.uint16, np.float16, np.float32, np.float64]
    if img.dtype not in supported_dtypes:  # noqa: PLR6201
        raise ImageFormatError(
            "image must be uint8, uint16, float16, float32 or float64, "
            f"got dtype: {img.dtype}"
        )

    if img.dtype == np.uint8:
        img = img.astype(np.float32) / UINT8_MAX
    elif img.dtype == np.uint16:
        img = img.astype(np.float32) / UINT16_MAX

    img = np.clip(img, 0, 1)
    return img.astype(np.float32)


def prepare_image_for_model(image: np.ndarray, roi_size: int = 512) -> tuple[np.ndarray, dict]:
    """Prepare any input image for model inference by resizing to roi_size x roi_size.

    This function takes an image of any size and prepares it for a model that
    expects a square input (e.g., 512x512). It handles aspect ratio
    preservation by padding the shorter dimension with replicated border
    values before resizing.

    Args:
        image: Input image of any size.
        roi_size: Target side length for the model input (default 512).

    Returns:
        A tuple ``(preprocessed_image, metadata_dict)``:

        - preprocessed_image: Image resized to roi_size x roi_size.
        - metadata_dict: Information needed to composite back to the original
          size, with keys:

          - ``"original_shape"``: (height, width) of the input image.
          - ``"square_shape"``: (height, width) after padding to a square.
          - ``"original_padding"``: (pad_h, pad_w, pad_h_extra, pad_w_extra),
            i.e. (top, left, bottom, right) padding in pixels.
    """
    original_shape = image.shape[:2]  # (height, width)
    height, width = original_shape

    # Split the size difference across both sides of the shorter dimension;
    # when the difference is odd the extra pixel goes to the bottom/right.
    size_diff = abs(height - width)
    pad_before = size_diff // 2
    pad_after = size_diff - pad_before

    pad_h = pad_w = pad_h_extra = pad_w_extra = 0
    if height < width:
        pad_h, pad_h_extra = pad_before, pad_after
    elif height > width:
        pad_w, pad_w_extra = pad_before, pad_after

    # Pad the image to make it square.
    padded_image = cv2.copyMakeBorder(
        image,
        top=pad_h,
        bottom=pad_h_extra,
        left=pad_w,
        right=pad_w_extra,
        borderType=cv2.BORDER_REPLICATE,
    )
    # Recorded before any downsampling so the output can later be resized
    # back to exactly this padded size.
    square_shape = padded_image.shape[:2]

    # Halve very large images with pyrDown until they are within 3x of the
    # target size, then do the final linear resize.
    while padded_image.shape[1] > roi_size * 3 and padded_image.shape[0] > roi_size * 3:
        padded_image = cv2.pyrDown(padded_image)

    resized_image = cv2.resize(padded_image, (roi_size, roi_size), interpolation=cv2.INTER_LINEAR)

    metadata = {
        "original_shape": original_shape,
        "square_shape": square_shape,
        "original_padding": (pad_h, pad_w, pad_h_extra, pad_w_extra),
    }
    return resized_image, metadata


def composite_model_output_to_image(
    model_output: np.ndarray, metadata: dict, interp_mode: int = cv2.INTER_NEAREST
) -> np.ndarray:
    """Composite model output back to the original image size.

    Takes the model output (which should be roi_size x roi_size) and
    composites it back to the original image dimensions using the metadata
    from prepare_image_for_model.

    Args:
        model_output: Output from the model (roi_size x roi_size).
        metadata: Metadata dict returned from prepare_image_for_model; the
            ``"original_padding"`` and ``"square_shape"`` entries are used.
        interp_mode: Interpolation mode for resizing (default INTER_NEAREST
            for discrete outputs).

    Returns:
        np.ndarray: Output composited to original image size.
    """
    pad_h, pad_w, pad_h_extra, pad_w_extra = metadata["original_padding"]
    square_height, square_width = metadata["square_shape"]

    # Resize the entire model output back to the padded square shape.
    # cv2.resize takes the target size as (width, height).
    resized_to_square = cv2.resize(
        model_output, (square_width, square_height), interpolation=interp_mode
    )

    # Remove the padding to get back to the original dimensions. A zero pad
    # yields a full-range slice, so one expression covers the padded-rows,
    # padded-columns and no-padding cases.
    return resized_to_square[
        pad_h : square_height - pad_h_extra,
        pad_w : square_width - pad_w_extra,
    ]