from typing import Dict, List, Tuple, Optional, Literal

import torch
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from torchvision.transforms import ToTensor, Normalize
from rfdetr.util.misc import nested_tensor_from_tensor_list
from rfdetr.models.lwdetr import PostProcess


class RFDetrImageProcessor(BaseImageProcessor):
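    """Image processor for RF-DETR in the style of the transformers
    BaseImageProcessor API: `preprocess` converts, normalizes, and pads images
    into a model-ready batch, and `post_process_object_detection` rescales raw
    model outputs back to absolute box coordinates.
    """
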
    model_input_names = ["pixel_values", "pixel_mask"]

    def __init__(
        self,
        model_name: Literal['RFDETRBase', 'RFDETRLarge'] = 'RFDETRBase',
        num_select: int = 300,
        image_mean: List[float] = [0.485, 0.456, 0.406],
        image_std: List[float] = [0.229, 0.224, 0.225],
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.model_name = model_name
        self.config = {
            'image_mean': image_mean,
            'image_std': image_std,
        }
        self.post_process_config = {
            'num_select': num_select,
        }
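        # num_select caps how many detections PostProcess keeps per image,
        # e.g. RFDetrImageProcessor(num_select=100) keeps the top 100 scoring
        # boxes during post-processing.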

    def post_process_object_detection(
        self,
        outputs,
        target_sizes: List[Tuple],
        **kwargs,
    ) -> List[Dict[str, torch.Tensor]]:
        """
        Parameters
        ----------
        outputs:
            outputs from a model loaded with AutoModelForObjectDetection, or
            the raw output list of an ONNX model
        target_sizes: List[Tuple]
            original (height, width) of each image, used to rescale the
            predicted boxes to absolute coordinates
        """
        if isinstance(outputs, list):  # ONNX models return a plain list of arrays: [logits, boxes]
            logits = torch.tensor(outputs[0])
            pred_boxes = torch.tensor(outputs[1])
        else:
            logits = outputs.logits
            pred_boxes = outputs.pred_boxes
        
        outputs = {
            'pred_logits': logits,
            'pred_boxes': pred_boxes,
        }

        # delegate to rfdetr's PostProcess, which selects the top num_select
        # detections and rescales boxes to the original image sizes
        post_process = PostProcess(self.post_process_config['num_select'])
        detections = post_process(
            outputs,
            target_sizes=target_sizes,
        )

        return detections
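
    # A hedged sketch of the ONNX path above (`session` as an
    # onnxruntime.InferenceSession and the input name "input" are illustrative
    # assumptions, not part of this module):
    #
    #   raw = session.run(None, {"input": batch["pixel_values"].numpy()})
    #   detections = processor.post_process_object_detection(
    #       raw, target_sizes=batch["target_sizes"]
    #   )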

    def convert_and_validate_boxes(self, annotations, images):
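        """Convert each annotation's COCO boxes [x_min, y_min, w, h] to center
        format [cx, cy, w, h] in place, then assert that every box lies fully
        inside its image. For example, the COCO box [10, 20, 30, 40] becomes
        [10 + 30/2, 20 + 40/2, 30, 40] = [25, 40, 30, 40].
        """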
        for ann, img in zip(annotations, images):
            # convert from COCO format [x_min, y_min, width, height] to [cx, cy, w, h]
            boxes = ann["boxes"].to(torch.float32)
            boxes[:, [0,1]] += boxes[:, [2,3]] / 2
            ann["boxes"] = boxes

            torch._assert(isinstance(boxes, torch.Tensor), "Expected target boxes to be of type Tensor.")
            torch._assert(
                len(boxes.shape) == 2 and boxes.shape[-1] == 4,
                "Expected target boxes to be a tensor of shape [N, 4].",
            )
            for box in boxes:
                torch._assert(
                    box[2] / 2 <= box[0] <= img.shape[2] - box[2] / 2 and box[3] / 2 <= box[1] <= img.shape[1] - box[3] / 2,
                    "Expected w/2 <= cx <= W - w/2 and h/2 <= cy <= H - h/2.",
                )

    def preprocess(
        self,
        images,
        annotations=None,
    ) -> BatchFeature:
        """
        Parameters
        ----------
        images: PIL.Image.Image or List[PIL.Image.Image]
            a single PIL image or a list of PIL images
        annotations: Optional[List[Dict[str, torch.Tensor | List]]]
            list of annotations, one per image. For object detection, each
            annotation should be a dictionary with the following keys:

            - boxes (FloatTensor[N, 4]): the ground-truth boxes in COCO format
              [x_min, y_min, width, height]
            - class_labels (Int64Tensor[N]): the class label for each
              ground-truth box
        """
        totensor = ToTensor() 
        normalize = Normalize(mean=self.config['image_mean'], std=self.config['image_std'])

        if images is not None and not isinstance(images, list):
            images = [images]  # wrap a single image into a batch of one
        if not isinstance(images[0], torch.Tensor):
            images = [totensor(img) for img in images]
        if annotations is not None:
            self.convert_and_validate_boxes(annotations, images)

        # get the original image sizes
        original_image_sizes: List[Tuple[int, int]] = []
        for img in images:
            val = img.shape[-2:]
            torch._assert(
                len(val) == 2,
                f"expecting the last two dimensions of the Tensor to be H and W instead got {img.shape[-2:]}",
            )
            original_image_sizes.append((val[0], val[1])) 
        target_sizes = torch.tensor(original_image_sizes)

        # transform the input
        # normalize image
        images = [normalize(img) for img in images]
        # pad the images to a common size: pixel tensor [B, C, H, W] plus a padding mask [B, H, W]
        nested_tensor = nested_tensor_from_tensor_list(images)

        data = {
            'pixel_values': nested_tensor.tensors, 
            'pixel_mask': nested_tensor.mask, 
            'target_sizes': target_sizes, 
            'labels': annotations
        }
        return BatchFeature(data=data)
    
    
__all__ = [
    "RFDetrImageProcessor"
]
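
# A minimal end-to-end sketch; the checkpoint path "rf-detr-base" and the
# image file below are illustrative assumptions, not guaranteed names:
#
#   from PIL import Image
#   from transformers import AutoModelForObjectDetection
#
#   processor = RFDetrImageProcessor()
#   model = AutoModelForObjectDetection.from_pretrained("rf-detr-base")
#   batch = processor.preprocess([Image.open("image.jpg")])
#   outputs = model(pixel_values=batch["pixel_values"], pixel_mask=batch["pixel_mask"])
#   detections = processor.post_process_object_detection(
#       outputs, target_sizes=batch["target_sizes"]
#   )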