File size: 5,298 Bytes
a249588
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional

import numpy as np

from mmpose.registry import KEYPOINT_CODECS
from mmpose.structures import bbox_cs2xyxy, bbox_xyxy2cs
from .base import BaseKeypointCodec


@KEYPOINT_CODECS.register_module()
class EDPoseLabel(BaseKeypointCodec):
    r"""Generate keypoint and label coordinates for `ED-Pose`_ by
    Yang J. et al (2023).

    Note:

        - instance number: N
        - keypoint number: K
        - keypoint dimension: D
        - image size: [w, h]

    Encoded:

        - keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D),
            divided by the image size
        - keypoints_visible (np.ndarray): Keypoint visibility. It is
            returned as given; when not given it defaults to ones in
            shape (N, K)
        - area (np.ndarray): Area in shape (N), divided by ``w * h``
        - bbox (np.ndarray): Bbox in shape (N, 4), converted with
            ``bbox_xyxy2cs`` and divided by ``[w, h, w, h]``

    Args:
        num_select (int): The number of candidate instances kept by
            :meth:`decode`. Defaults to 100
        num_keypoints (int): The number of keypoints. Defaults to 17
    """

    auxiliary_encode_keys = {'area', 'bboxes', 'img_shape'}
    instance_mapping_table = dict(
        bbox='bboxes',
        keypoints='keypoints',
        keypoints_visible='keypoints_visible',
        area='areas',
    )

    def __init__(self, num_select: int = 100, num_keypoints: int = 17):
        super().__init__()

        self.num_select = num_select
        self.num_keypoints = num_keypoints

    def encode(
        self,
        img_shape,
        keypoints: np.ndarray,
        keypoints_visible: Optional[np.ndarray] = None,
        area: Optional[np.ndarray] = None,
        bboxes: Optional[np.ndarray] = None,
    ) -> dict:
        """Encoding keypoints, area and bbox from input image space to
        normalized space.

        Args:
            - img_shape (Sequence[int]): The shape of image in the format
                of (width, height).
            - keypoints (np.ndarray): Keypoint coordinates in
                shape (N, K, D).
            - keypoints_visible (np.ndarray): Keypoint visibility in shape
                (N, K). Defaults to all ones when ``None``.
            - area (np.ndarray): Instance areas in image space. Skipped
                when ``None``.
            - bboxes (np.ndarray): Instance bboxes in image space, passed
                to ``bbox_xyxy2cs``. Skipped when ``None``.

        Returns:
            encoded (dict): Contains the following items:

                - keypoints (np.ndarray): The keypoints divided by
                    ``[w, h]``, in shape like (N, K, D).
                - area (np.ndarray | None): The area divided by ``w * h``.
                - bbox (np.ndarray | None): The output of
                    ``bbox_xyxy2cs`` concatenated along the last axis and
                    divided by ``[w, h, w, h]``.
                - keypoints_visible (np.ndarray): Keypoint visibility,
                    returned unchanged (or the all-ones default).
        """
        w, h = img_shape

        if keypoints_visible is None:
            keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32)

        if bboxes is not None:
            # bbox_xyxy2cs returns several arrays; join them back into one
            # array per instance before normalizing.
            bboxes = np.concatenate(bbox_xyxy2cs(bboxes), axis=-1)
            bboxes = bboxes / np.array([w, h, w, h], dtype=np.float32)

        if area is not None:
            area = area / float(w * h)

        if keypoints is not None:
            keypoints = keypoints / np.array([w, h], dtype=np.float32)

        encoded = dict(
            keypoints=keypoints,
            area=area,
            bbox=bboxes,
            keypoints_visible=keypoints_visible)

        return encoded

    def decode(self, input_shapes: np.ndarray, pred_logits: np.ndarray,
               pred_boxes: np.ndarray,
               pred_keypoints: np.ndarray) -> tuple:
        """Select the final top-k keypoints, and decode the results from
        normalize size to origin input size.

        Args:
            input_shapes (np.ndarray): The size of the resized input image.
                It is split into two halves along axis 0; the first half
                is used as the height and the second as the width.
            pred_logits (np.ndarray): The result of score, a 2-D array
                whose rows are candidates and whose columns are classes.
            pred_boxes (np.ndarray): The result of bbox. The first two
                columns and the remaining columns are passed separately
                to ``bbox_cs2xyxy``.
            pred_keypoints (np.ndarray): The result of keypoints, one row
                per candidate. Only the first ``2 * num_keypoints``
                columns are used.

        Returns:
            tuple: Decoded boxes, keypoints in shape
            (num_selected, num_keypoints, 2), and keypoint scores in shape
            (num_selected, num_keypoints).
        """
        num_keypoints = self.num_keypoints
        num_classes = pred_logits.shape[1]

        # Rank every (candidate, class) score jointly and keep the best
        # ``num_select`` entries of the flattened score array.
        prob = pred_logits.reshape(-1)
        topk_indexes = np.argsort(-prob)[:self.num_select]

        # Every keypoint of an instance shares that instance's score.
        scores = np.tile(prob[topk_indexes][:, np.newaxis],
                         [1, num_keypoints])

        # A flat index maps back to its candidate row by integer division
        # with the number of classes; boxes and keypoints share this index.
        topk_instances = topk_indexes // num_classes

        # Decode bounding boxes
        boxes = bbox_cs2xyxy(*np.split(pred_boxes, [2], axis=-1))
        boxes = boxes[topk_instances]

        # Convert from relative to absolute coordinates
        # NOTE(review): the first half of ``input_shapes`` is treated as
        # the height here, whereas ``encode`` takes (width, height) --
        # confirm the order against the caller.
        img_h, img_w = np.split(input_shapes, 2, axis=0)
        scale_fct = np.hstack([img_w, img_h, img_w, img_h])
        boxes = boxes * scale_fct[np.newaxis, :]

        # Decode keypoints: keep the leading 2 * K columns of the selected
        # rows, then rescale them by (width, height) repeated K times.
        keypoints = pred_keypoints[topk_instances, :num_keypoints * 2]
        keypoint_scale = np.tile(np.hstack([img_w, img_h]), [num_keypoints])
        keypoints = keypoints * keypoint_scale[np.newaxis, :]
        keypoints = keypoints.reshape(-1, num_keypoints, 2)

        return boxes, keypoints, scores