# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Sequence, Tuple, Union
import torch
from mmcv.cnn import build_conv_layer, build_upsample_layer
from mmengine.structures import PixelData
from torch import Tensor, nn
from mmpose.evaluation.functional import pose_pck_accuracy
from mmpose.models.utils.tta import flip_heatmaps
from mmpose.registry import KEYPOINT_CODECS, MODELS
from mmpose.utils.tensor_utils import to_numpy
from mmpose.utils.typing import (ConfigType, Features, OptConfigType,
OptSampleList, Predictions)
from ..base_head import BaseHead
import numpy as np
from sparsemax import Sparsemax
import os
import shutil
import cv2
from mmpose.structures.keypoint import fix_bbox_aspect_ratio
OptIntSeq = Optional[Sequence[int]]
@MODELS.register_module()
class CalibrationHead(BaseHead):
"""Multi-variate head predicting all information about keypoints. Apart
from the heatmap, it also predicts:
1) Heatmap for each keypoint
2) Probability of keypoint being in the heatmap
3) Visibility of each keypoint
4) Predicted OKS per keypoint
5) Predictd euclidean error per keypoint
The heatmap predicting part is the same as HeatmapHead introduced in
in `Simple Baselines`_ by Xiao et al (2018).
Args:
in_channels (int | Sequence[int]): Number of channels in the input
feature map
out_channels (int): Number of channels in the output heatmap
deconv_out_channels (Sequence[int], optional): The output channel
number of each deconv layer. Defaults to ``(256, 256, 256)``
deconv_kernel_sizes (Sequence[int | tuple], optional): The kernel size
of each deconv layer. Each element should be either an integer for
both height and width dimensions, or a tuple of two integers for
            the height and the width dimension respectively. Defaults to
``(4, 4, 4)``
conv_out_channels (Sequence[int], optional): The output channel number
of each intermediate conv layer. ``None`` means no intermediate
conv layer between deconv layers and the final conv layer.
Defaults to ``None``
conv_kernel_sizes (Sequence[int | tuple], optional): The kernel size
of each intermediate conv layer. Defaults to ``None``
final_layer_dict (dict): Arguments of the final Conv2d layer.
Defaults to ``dict(kernel_size=1)``
keypoint_loss (Config): Config of the keypoint loss. Defaults to use
:class:`KeypointMSELoss`
probability_loss (Config): Config of the probability loss. Defaults to use
:class:`BCELoss`
visibility_loss (Config): Config of the visibility loss. Defaults to use
:class:`BCELoss`
oks_loss (Config): Config of the oks loss. Defaults to use
:class:`MSELoss`
error_loss (Config): Config of the error loss. Defaults to use
:class:`L1LogLoss`
        normalize (float, optional): If set, heatmap values are normalized
            with Sparsemax over the flattened spatial dimensions and rescaled
            by this factor; ``None`` disables the normalization. Defaults to
            ``None``
detach_probability (bool): Whether to detach the probability
from gradient computation. Defaults to ``True``
detach_visibility (bool): Whether to detach the visibility
from gradient computation. Defaults to ``True``
learn_heatmaps_from_zeros (bool): Whether to learn the
heatmaps from zeros. Defaults to ``False``
freeze_heatmaps (bool): Whether to freeze the heatmaps prediction.
Defaults to ``False``
freeze_probability (bool): Whether to freeze the probability prediction.
Defaults to ``False``
freeze_visibility (bool): Whether to freeze the visibility prediction.
Defaults to ``False``
freeze_oks (bool): Whether to freeze the oks prediction.
Defaults to ``False``
freeze_error (bool): Whether to freeze the error prediction.
Defaults to ``False``
decoder (Config, optional): The decoder config that controls decoding
keypoint coordinates from the network output. Defaults to ``None``
init_cfg (Config, optional): Config to control the initialization. See
:attr:`default_init_cfg` for default settings
.. _`Simple Baselines`: https://arxiv.org/abs/1804.06208
"""
_version = 2
def __init__(self,
in_channels: Union[int, Sequence[int]],
out_channels: int,
deconv_out_channels: OptIntSeq = (256, 256, 256),
deconv_kernel_sizes: OptIntSeq = (4, 4, 4),
conv_out_channels: OptIntSeq = None,
conv_kernel_sizes: OptIntSeq = None,
final_layer_dict: dict = dict(kernel_size=1),
keypoint_loss: ConfigType = dict(
type='KeypointMSELoss', use_target_weight=True),
probability_loss: ConfigType = dict(
type='BCELoss', use_target_weight=True),
visibility_loss: ConfigType = dict(
type='BCELoss', use_target_weight=True),
oks_loss: ConfigType = dict(
type='MSELoss', use_target_weight=True),
error_loss: ConfigType = dict(
type='L1LogLoss', use_target_weight=True),
normalize: float = None,
detach_probability: bool = True,
detach_visibility: bool = True,
learn_heatmaps_from_zeros: bool = False,
freeze_heatmaps: bool = False,
freeze_probability: bool = False,
freeze_visibility: bool = False,
freeze_oks: bool = False,
freeze_error: bool = False,
decoder: OptConfigType = dict(
type='UDPHeatmap', input_size=(192, 256),
heatmap_size=(48, 64), sigma=2),
init_cfg: OptConfigType = None,
):
if init_cfg is None:
init_cfg = self.default_init_cfg
super().__init__(init_cfg)
self.in_channels = in_channels
self.out_channels = out_channels
self.keypoint_loss_module = MODELS.build(keypoint_loss)
self.probability_loss_module = MODELS.build(probability_loss)
self.visibility_loss_module = MODELS.build(visibility_loss)
self.oks_loss_module = MODELS.build(oks_loss)
self.error_loss_module = MODELS.build(error_loss)
self.temperature = torch.nn.Parameter(torch.tensor(1.0), requires_grad=True)
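        # Pre-compute a separable Gaussian kernel (outer product of a
        # normalized 1-D Gaussian with itself). It is currently only used by
        # the commented-out heatmap blurring in `forward_heatmap`.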
self.gauss_sigma = 2.0
self.gauss_kernel_size = int(2.0 * 3.0 * self.gauss_sigma + 1.0)
ts = torch.linspace(
- self.gauss_kernel_size // 2,
self.gauss_kernel_size // 2,
self.gauss_kernel_size
)
gauss = torch.exp(-(ts / self.gauss_sigma)**2 / 2)
gauss = gauss / gauss.sum()
self.gauss_kernel = gauss.unsqueeze(0) * gauss.unsqueeze(1)
self.decoder = KEYPOINT_CODECS.build(decoder)
self.nonlinearity = nn.ReLU(inplace=True)
self.learn_heatmaps_from_zeros = learn_heatmaps_from_zeros
self.num_iters = 0
unique_hash = np.random.randint(0, 100000)
self.loss_vis_folder = "work_dirs/loss_vis_{:05d}".format(unique_hash)
self.interval = 50
shutil.rmtree(self.loss_vis_folder, ignore_errors=True)
print("Will save heatmap visualizations to folder '{:s}'".format(self.loss_vis_folder))
self._build_heatmap_head(
in_channels=in_channels,
out_channels=out_channels,
deconv_out_channels=deconv_out_channels,
deconv_kernel_sizes=deconv_kernel_sizes,
conv_out_channels=conv_out_channels,
conv_kernel_sizes=conv_kernel_sizes,
final_layer_dict=final_layer_dict,
normalize=normalize,
freeze=freeze_heatmaps)
self.normalize = normalize
self.detach_probability = detach_probability
self._build_probability_head(
in_channels=in_channels,
out_channels=out_channels,
freeze=freeze_probability)
self.detach_visibility = detach_visibility
self._build_visibility_head(
in_channels=in_channels,
out_channels=out_channels,
freeze=freeze_visibility)
self._build_oks_head(
in_channels=in_channels,
out_channels=out_channels,
freeze=freeze_oks)
self.freeze_oks = freeze_oks
self._build_error_head(
in_channels=in_channels,
out_channels=out_channels,
freeze=freeze_error)
self.freeze_error = freeze_error
# Register the hook to automatically convert old version state dicts
self._register_load_state_dict_pre_hook(self._load_state_dict_pre_hook)
self._freeze_all_but_temperature()
        # Print all params and whether they require gradients
print("\n", "="*20)
for name, param in self.named_parameters():
print(name, param.requires_grad)
def _freeze_all_but_temperature(self):
for param in self.parameters():
param.requires_grad = False
self.temperature.requires_grad = True
def _build_heatmap_head(self, in_channels: int, out_channels: int,
deconv_out_channels: Sequence[int],
deconv_kernel_sizes: Sequence[int],
conv_out_channels: Sequence[int],
conv_kernel_sizes: Sequence[int],
final_layer_dict: dict,
                            normalize: float = None,
freeze: bool = False) -> nn.Module:
"""Build the heatmap head module."""
if deconv_out_channels:
if deconv_kernel_sizes is None or len(deconv_out_channels) != len(
deconv_kernel_sizes):
raise ValueError(
'"deconv_out_channels" and "deconv_kernel_sizes" should '
'be integer sequences with the same length. Got '
f'mismatched lengths {deconv_out_channels} and '
f'{deconv_kernel_sizes}')
self.deconv_layers = self._make_deconv_layers(
in_channels=in_channels,
layer_out_channels=deconv_out_channels,
layer_kernel_sizes=deconv_kernel_sizes,
)
in_channels = deconv_out_channels[-1]
else:
self.deconv_layers = nn.Identity()
if conv_out_channels:
if conv_kernel_sizes is None or len(conv_out_channels) != len(
conv_kernel_sizes):
raise ValueError(
'"conv_out_channels" and "conv_kernel_sizes" should '
'be integer sequences with the same length. Got '
f'mismatched lengths {conv_out_channels} and '
f'{conv_kernel_sizes}')
self.conv_layers = self._make_conv_layers(
in_channels=in_channels,
layer_out_channels=conv_out_channels,
layer_kernel_sizes=conv_kernel_sizes)
in_channels = conv_out_channels[-1]
else:
self.conv_layers = nn.Identity()
if final_layer_dict is not None:
cfg = dict(
type='Conv2d',
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1)
cfg.update(final_layer_dict)
self.final_layer = build_conv_layer(cfg)
else:
self.final_layer = nn.Identity()
# self.normalize_layer = lambda x: x / x.sum(dim=-1, keepdim=True) if normalize else nn.Identity()
# self.normalize_layer = nn.Softmax(dim=-1) if normalize else nn.Identity()
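        # Sparsemax over the flattened spatial dimension yields a sparse
        # per-keypoint distribution that sums to 1; `forward_heatmap` later
        # rescales it by `self.normalize` and clamps it to [0, 1]. With
        # `normalize=None` the layer is a no-op.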
self.normalize_layer = nn.Identity() if normalize is None else Sparsemax(dim=-1)
if freeze:
for param in self.deconv_layers.parameters():
param.requires_grad = False
for param in self.conv_layers.parameters():
param.requires_grad = False
for param in self.final_layer.parameters():
param.requires_grad = False
def _build_probability_head(self, in_channels: int, out_channels: int,
freeze: bool = False) -> nn.Module:
"""Build the probability head module."""
ppb_layers = []
kernel_sizes = [(4, 3), (2, 2), (2, 2)]
for i in range(len(kernel_sizes)):
ppb_layers.append(
build_conv_layer(
dict(type='Conv2d'),
in_channels=in_channels,
out_channels=in_channels,
kernel_size=3,
stride=1,
padding=1))
ppb_layers.append(
nn.BatchNorm2d(num_features=in_channels))
ppb_layers.append(
nn.MaxPool2d(kernel_size=kernel_sizes[i], stride=kernel_sizes[i], padding=0))
ppb_layers.append(self.nonlinearity)
ppb_layers.append(
build_conv_layer(
dict(type='Conv2d'),
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=1,
padding=0))
ppb_layers.append(nn.Sigmoid())
self.probability_layers = nn.Sequential(*ppb_layers)
if freeze:
for param in self.probability_layers.parameters():
param.requires_grad = False
def _build_visibility_head(self, in_channels: int, out_channels: int,
freeze: bool = False) -> nn.Module:
"""Build the visibility head module."""
vis_layers = []
kernel_sizes = [(4, 3), (2, 2), (2, 2)]
for i in range(len(kernel_sizes)):
vis_layers.append(
build_conv_layer(
dict(type='Conv2d'),
in_channels=in_channels,
out_channels=in_channels,
kernel_size=3,
stride=1,
padding=1))
vis_layers.append(
nn.BatchNorm2d(num_features=in_channels))
vis_layers.append(
nn.MaxPool2d(kernel_size=kernel_sizes[i], stride=kernel_sizes[i], padding=0))
vis_layers.append(self.nonlinearity)
vis_layers.append(
build_conv_layer(
dict(type='Conv2d'),
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=1,
padding=0))
vis_layers.append(nn.Sigmoid())
self.visibility_layers = nn.Sequential(*vis_layers)
if freeze:
for param in self.visibility_layers.parameters():
param.requires_grad = False
def _build_oks_head(self, in_channels: int, out_channels: int,
freeze: bool = False) -> nn.Module:
"""Build the oks head module."""
oks_layers = []
kernel_sizes = [(4, 3), (2, 2), (2, 2)]
for i in range(len(kernel_sizes)):
oks_layers.append(
build_conv_layer(
dict(type='Conv2d'),
in_channels=in_channels,
out_channels=in_channels,
kernel_size=3,
stride=1,
padding=1))
oks_layers.append(
nn.BatchNorm2d(num_features=in_channels))
oks_layers.append(
nn.MaxPool2d(kernel_size=kernel_sizes[i], stride=kernel_sizes[i], padding=0))
oks_layers.append(self.nonlinearity)
oks_layers.append(
build_conv_layer(
dict(type='Conv2d'),
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=1,
padding=0))
oks_layers.append(nn.Sigmoid())
self.oks_layers = nn.Sequential(*oks_layers)
if freeze:
for param in self.oks_layers.parameters():
param.requires_grad = False
def _build_error_head(self, in_channels: int, out_channels: int,
freeze: bool = False) -> nn.Module:
"""Build the error head module."""
error_layers = []
kernel_sizes = [(4, 3), (2, 2), (2, 2)]
for i in range(len(kernel_sizes)):
error_layers.append(
build_conv_layer(
dict(type='Conv2d'),
in_channels=in_channels,
out_channels=in_channels,
kernel_size=3,
stride=1,
padding=1))
error_layers.append(
nn.BatchNorm2d(num_features=in_channels))
error_layers.append(
nn.MaxPool2d(kernel_size=kernel_sizes[i], stride=kernel_sizes[i], padding=0))
error_layers.append(self.nonlinearity)
error_layers.append(
build_conv_layer(
dict(type='Conv2d'),
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=1,
padding=0))
error_layers.append(self.nonlinearity)
self.error_layers = nn.Sequential(*error_layers)
if freeze:
for param in self.error_layers.parameters():
param.requires_grad = False
def _make_conv_layers(self, in_channels: int,
layer_out_channels: Sequence[int],
layer_kernel_sizes: Sequence[int]) -> nn.Module:
"""Create convolutional layers by given parameters."""
layers = []
for out_channels, kernel_size in zip(layer_out_channels,
layer_kernel_sizes):
padding = (kernel_size - 1) // 2
cfg = dict(
type='Conv2d',
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=1,
padding=padding)
layers.append(build_conv_layer(cfg))
layers.append(nn.BatchNorm2d(num_features=out_channels))
layers.append(self.nonlinearity)
in_channels = out_channels
return nn.Sequential(*layers)
def _make_deconv_layers(self, in_channels: int,
layer_out_channels: Sequence[int],
layer_kernel_sizes: Sequence[int]) -> nn.Module:
"""Create deconvolutional layers by given parameters."""
layers = []
for out_channels, kernel_size in zip(layer_out_channels,
layer_kernel_sizes):
if kernel_size == 4:
padding = 1
output_padding = 0
elif kernel_size == 3:
padding = 1
output_padding = 1
elif kernel_size == 2:
padding = 0
output_padding = 0
else:
                raise ValueError(f'Unsupported kernel size {kernel_size} for '
                                 'deconvolutional layers in '
                                 f'{self.__class__.__name__}')
cfg = dict(
type='deconv',
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=2,
padding=padding,
output_padding=output_padding,
bias=False)
layers.append(build_upsample_layer(cfg))
layers.append(nn.BatchNorm2d(num_features=out_channels))
layers.append(self.nonlinearity)
in_channels = out_channels
return nn.Sequential(*layers)
def _error_from_heatmaps(self, gt_heatmaps: Tensor, dt_heatmaps: Tensor) -> Tensor:
"""Calculate the error from heatmaps.
Args:
heatmaps (Tensor): The predicted heatmaps.
Returns:
Tensor: The predicted error.
"""
# Transform to numpy
gt_heatmaps = to_numpy(gt_heatmaps)
dt_heatmaps = to_numpy(dt_heatmaps)
# Get locations from heatmaps
B, C, H, W = gt_heatmaps.shape
gt_coords = np.zeros((B, C, 2))
dt_coords = np.zeros((B, C, 2))
for i, (gt_htm, dt_htm) in enumerate(zip(gt_heatmaps, dt_heatmaps)):
coords, score = self.decoder.decode(gt_htm)
coords = coords.squeeze()
gt_coords[i, :, :] = coords
coords, score = self.decoder.decode(dt_htm)
coords = coords.squeeze()
dt_coords[i, :, :] = coords
# NaN coordinates mean empty heatmaps -> set them to -1
# as the error will be ignored by weight
gt_coords[np.isnan(gt_coords)] = -1
# Calculate the error
target_errors = np.linalg.norm(gt_coords - dt_coords, axis=2)
assert (target_errors >= 0).all(), "Euclidean distance cannot be negative"
return target_errors
def _oks_from_heatmaps(self, gt_heatmaps: Tensor, dt_heatmaps: Tensor, weight: Tensor) -> Tensor:
"""Calculate the OKS from heatmaps.
Args:
heatmaps (Tensor): The predicted heatmaps.
Returns:
Tensor: The predicted OKS.
"""
C = dt_heatmaps.shape[1]
# Transform to numpy
gt_heatmaps = to_numpy(gt_heatmaps)
dt_heatmaps = to_numpy(dt_heatmaps)
B, C, H, W = gt_heatmaps.shape
weight = to_numpy(weight).squeeze().reshape((B, C, 1))
# Get locations from heatmaps
gt_coords = np.zeros((B, C, 2))
dt_coords = np.zeros((B, C, 2))
for i, (gt_htm, dt_htm) in enumerate(zip(gt_heatmaps, dt_heatmaps)):
coords, score = self.decoder.decode(gt_htm)
coords = coords.squeeze()
gt_coords[i, :, :] = coords
coords, score = self.decoder.decode(dt_htm)
coords = coords.squeeze()
dt_coords[i, :, :] = coords
# NaN coordinates mean empty heatmaps -> set them to 0
gt_coords[np.isnan(gt_coords)] = 0
# Add probability as visibility
gt_coords = gt_coords * weight
dt_coords = dt_coords * weight
gt_coords = np.concatenate((gt_coords, weight*2), axis=2)
dt_coords = np.concatenate((dt_coords, weight*2), axis=2)
# Calculate the oks
target_oks = []
oks_weights = []
for i in range(len(gt_coords)):
gt_kpts = gt_coords[i]
dt_kpts = dt_coords[i]
valid_gt_kpts = gt_kpts[:, 2] > 0
if not valid_gt_kpts.any():
# Changed for per-keypoint OKS
target_oks.append(np.zeros(C))
oks_weights.append(0)
continue
gt_bbox = np.array([
0, 0,
64, 48,
])
gt = {
'keypoints': gt_kpts,
'bbox': gt_bbox,
'area': gt_bbox[2] * gt_bbox[3],
}
dt = {
'keypoints': dt_kpts,
'bbox': gt_bbox,
'area': gt_bbox[2] * gt_bbox[3],
}
# Changed for per-keypoint OKS
oks = compute_oks(gt, dt, use_area=False, per_kpt=True)
target_oks.append(oks)
oks_weights.append(1)
target_oks = np.array(target_oks)
target_oks = torch.from_numpy(target_oks).float()
oks_weights = np.array(oks_weights)
oks_weights = torch.from_numpy(oks_weights).float()
return target_oks, oks_weights
@property
def default_init_cfg(self):
init_cfg = [
dict(
type='Normal', layer=['Conv2d', 'ConvTranspose2d'], std=0.001),
dict(type='Constant', layer='BatchNorm2d', val=1)
]
return init_cfg
def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
"""Forward the network. The input is multi scale feature maps and the
output is (1) the heatmap, (2) probability, (3) visibility, (4) oks and (5) error.
Args:
feats (Tensor): Multi scale feature maps.
Returns:
Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: outputs.
"""
x = feats[-1].detach()
heatmaps = self.forward_heatmap(x)
probabilities = self.forward_probability(x)
visibilities = self.forward_visibility(x)
oks = self.forward_oks(x)
errors = self.forward_error(x)
return heatmaps, probabilities, visibilities, oks, errors
def forward_heatmap(self, x: Tensor) -> Tensor:
"""Forward the network. The input is multi scale feature maps and the
output is the heatmap.
Args:
x (Tensor): Multi scale feature maps.
Returns:
Tensor: output heatmap.
"""
x = self.deconv_layers(x)
x = self.conv_layers(x)
x = self.final_layer(x)
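        # Flatten the spatial dims, apply temperature scaling and the
        # (optional) Sparsemax normalization, rescale by `self.normalize`
        # when set, clamp to [0, 1], then restore the (B, C, H, W) shape.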
B, C, H, W = x.shape
x = x.reshape((B, C, H*W))
x = self.normalize_layer(x/self.temperature)
if self.normalize is not None:
x = x * self.normalize
x = torch.clamp(x, 0, 1)
x = x.reshape((B, C, H, W))
# # Blur the heatmaps with Gaussian
# x = x.reshape((B*C, 1, H, W))
# x = nn.functional.conv2d(x, self.gauss_kernel[None, None, :, :].to(x.device), padding='same')
# x = x.reshape((B, C, H, W))
return x
def forward_probability(self, x: Tensor) -> Tensor:
"""Forward the network. The input is multi scale feature maps and the
output is the probability.
Args:
x (Tensor): Multi scale feature maps.
Returns:
Tensor: output probability.
"""
if self.detach_probability:
x = x.detach()
x = self.probability_layers(x)
return x
def forward_visibility(self, x: Tensor) -> Tensor:
"""Forward the network. The input is multi scale feature maps and the
output is the visibility.
Args:
x (Tensor): Multi scale feature maps.
Returns:
Tensor: output visibility.
"""
if self.detach_visibility:
x = x.detach()
x = self.visibility_layers(x)
return x
def forward_oks(self, x: Tensor) -> Tensor:
"""Forward the network. The input is multi scale feature maps and the
output is the oks.
Args:
x (Tensor): Multi scale feature maps.
Returns:
Tensor: output oks.
"""
x = x.detach()
x = self.oks_layers(x)
return x
def forward_error(self, x: Tensor) -> Tensor:
"""Forward the network. The input is multi scale feature maps and the
output is the euclidean error.
Args:
x (Tensor): Multi scale feature maps.
Returns:
Tensor: output error.
"""
x = x.detach()
x = self.error_layers(x)
return x
def predict(self,
feats: Features,
batch_data_samples: OptSampleList,
test_cfg: ConfigType = {}) -> Predictions:
"""Predict results from features.
Args:
feats (Tuple[Tensor] | List[Tuple[Tensor]]): The multi-stage
features (or multiple multi-stage features in TTA)
batch_data_samples (List[:obj:`PoseDataSample`]): The batch
data samples
test_cfg (dict): The runtime config for testing process. Defaults
to {}
Returns:
Union[InstanceList | Tuple[InstanceList | PixelDataList]]: If
``test_cfg['output_heatmap']==True``, return both pose and heatmap
prediction; otherwise only return the pose prediction.
The pose prediction is a list of ``InstanceData``, each contains
the following fields:
- keypoints (np.ndarray): predicted keypoint coordinates in
shape (num_instances, K, D) where K is the keypoint number
and D is the keypoint dimension
- keypoint_scores (np.ndarray): predicted keypoint scores in
shape (num_instances, K)
The heatmap prediction is a list of ``PixelData``, each contains
the following fields:
- heatmaps (Tensor): The predicted heatmaps in shape (K, h, w)
"""
if test_cfg.get('flip_test', False):
# TTA: flip test -> feats = [orig, flipped]
assert isinstance(feats, list) and len(feats) == 2
flip_indices = batch_data_samples[0].metainfo['flip_indices']
_feats, _feats_flip = feats
_htm, _prob, _vis, _oks, _err = self.forward(_feats)
_htm_flip, _prob_flip, _vis_flip, _oks_flip, _err_flip = self.forward(_feats_flip)
B, C, H, W = _htm.shape
# Flip back the keypoints
_htm_flip = flip_heatmaps(
_htm_flip,
flip_mode=test_cfg.get('flip_mode', 'heatmap'),
flip_indices=flip_indices,
shift_heatmap=test_cfg.get('shift_heatmap', False))
heatmaps = (_htm + _htm_flip) * 0.5
# Flip back scalars
_prob_flip = _prob_flip[:, flip_indices]
_vis_flip = _vis_flip[:, flip_indices]
_oks_flip = _oks_flip[:, flip_indices]
_err_flip = _err_flip[:, flip_indices]
probabilities = (_prob + _prob_flip) * 0.5
visibilities = (_vis + _vis_flip) * 0.5
oks = (_oks + _oks_flip) * 0.5
errors = (_err + _err_flip) * 0.5
else:
heatmaps, probabilities, visibilities, oks, errors = self.forward(feats)
B, C, H, W = heatmaps.shape
preds = self.decode(heatmaps)
probabilities = to_numpy(probabilities).reshape((B, 1, C))
visibilities = to_numpy(visibilities).reshape((B, 1, C))
oks = to_numpy(oks).reshape((B, 1, C))
errors = to_numpy(errors).reshape((B, 1, C))
# Normalize errors by dividing with the diagonal of the heatmap
htm_diagonal = np.sqrt(H**2 + W**2)
errors = errors / htm_diagonal
for pi, p in enumerate(preds):
p.set_field(p['keypoint_scores'], "keypoints_conf")
p.set_field(probabilities[pi], "keypoints_probs")
p.set_field(visibilities[pi], "keypoints_visible")
p.set_field(oks[pi], "keypoints_oks")
p.set_field(errors[pi], "keypoints_error")
# Replace the keypoint scores with OKS/errors
if not self.freeze_oks:
p.set_field(oks[pi], "keypoint_scores")
# p.set_field(1-errors[pi], "keypoint_scores")
# hm = heatmaps.detach().cpu().numpy()
# print("Heatmaps:", hm.shape, hm.min(), hm.max())
if test_cfg.get('output_heatmaps', False):
pred_fields = [
PixelData(heatmaps=hm) for hm in heatmaps.detach()
]
return preds, pred_fields
else:
return preds
def loss(self,
feats: Tuple[Tensor],
batch_data_samples: OptSampleList,
train_cfg: ConfigType = {}) -> dict:
"""Calculate losses from a batch of inputs and data samples.
Args:
feats (Tuple[Tensor]): The multi-stage features
batch_data_samples (List[:obj:`PoseDataSample`]): The batch
data samples
train_cfg (dict): The runtime config for training process.
Defaults to {}
Returns:
dict: A dictionary of losses.
"""
dt_heatmaps, dt_probs, dt_vis, dt_oks, dt_errs = self.forward(feats)
        device = dt_heatmaps.device
B, C, H, W = dt_heatmaps.shape
# Extract GT data
gt_heatmaps = torch.stack(
[d.gt_fields.heatmaps for d in batch_data_samples])
gt_probs = np.stack(
[d.gt_instances.in_image.astype(int) for d in batch_data_samples])
gt_annotated = np.stack(
[d.gt_instances.keypoints_visible.astype(int) for d in batch_data_samples])
gt_vis = np.stack(
[d.gt_instances.keypoints_visibility.astype(int) for d in batch_data_samples])
keypoint_weights = torch.cat([
d.gt_instance_labels.keypoint_weights for d in batch_data_samples
])
# Compute GT errors and OKS
if self.freeze_error:
gt_errs = torch.zeros((B, C, 1), device=device, dtype=dt_errs.dtype)
else:
gt_errs = self._error_from_heatmaps(gt_heatmaps, dt_heatmaps)
if self.freeze_oks:
gt_oks = torch.zeros((B, C, 1), device=device, dtype=dt_oks.dtype)
oks_weight = torch.zeros((B, C, 1), device=device, dtype=dt_oks.dtype)
else:
gt_oks, oks_weight = self._oks_from_heatmaps(
gt_heatmaps,
dt_heatmaps,
gt_probs & gt_annotated,
)
# Convert everything to tensors
gt_probs = torch.tensor(gt_probs, device=device, dtype=dt_probs.dtype)
gt_vis = torch.tensor(gt_vis, device=device, dtype=dt_vis.dtype)
gt_annotated = torch.tensor(gt_annotated, device=device)
gt_oks = gt_oks.to(device).to(dt_oks.dtype)
oks_weight = oks_weight.to(device).to(dt_oks.dtype)
gt_errs = gt_errs.to(device).to(dt_errs.dtype)
# Reshape everything to comparable shapes
gt_heatmaps = gt_heatmaps.view((B, C, H, W))
dt_heatmaps = dt_heatmaps.view((B, C, H, W))
gt_probs = gt_probs.view((B, C))
dt_probs = dt_probs.view((B, C))
gt_vis = gt_vis.view((B, C))
dt_vis = dt_vis.view((B, C))
gt_oks = gt_oks.view((B, C))
dt_oks = dt_oks.view((B, C))
gt_errs = gt_errs.view((B, C))
dt_errs = dt_errs.view((B, C))
keypoint_weights = keypoint_weights.view((B, C))
gt_annotated = gt_annotated.view((B, C))
# oks_weight = oks_weight.view((B, C))
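        # Keypoints that are both annotated and (per GT) inside the image;
        # used as the weight mask for the heatmap loss and the metrics below.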
annotated_in = gt_annotated & (gt_probs > 0.5)
# calculate losses
losses = dict()
if self.learn_heatmaps_from_zeros:
heatmap_weights = gt_annotated
else:
# heatmap_weights = keypoint_weights
heatmap_weights = annotated_in
heatmap_loss_pxl = self.keypoint_loss_module(dt_heatmaps, gt_heatmaps, annotated_in, per_pixel=True)
heatmap_loss = self.keypoint_loss_module(dt_heatmaps, gt_heatmaps, annotated_in)
# probability_loss = self.probability_loss_module(dt_probs, gt_probs, gt_annotated)
# visibility_loss = self.visibility_loss_module(dt_vis, gt_vis, annotated_in)
# oks_loss = self.oks_loss_module(dt_oks, gt_oks, annotated_in)
# error_loss = self.error_loss_module(dt_errs, gt_errs, annotated_in)
# Visualize some heatmaps
for i in range(0, B):
# continue
if self.num_iters % self.interval == 0:
self.interval = int(self.interval * 1.3)
os.makedirs(self.loss_vis_folder, exist_ok=True)
for kpt_i in np.random.choice(C, 17, replace=False):
tgt = gt_heatmaps[i, kpt_i].detach().cpu().numpy()
htm = dt_heatmaps[i, kpt_i].detach().cpu().numpy()
lss = heatmap_loss_pxl[i, kpt_i].detach().cpu().numpy()
save_img = self._visualize_heatmaps(
htm, tgt, lss, keypoint_weights[i, kpt_i], gt_probs[i, kpt_i]
)
save_path = os.path.join(
self.loss_vis_folder,
"heatmap_{:07d}-{:d}-{:d}.png".format(self.num_iters, i, kpt_i)
)
cv2.imwrite(save_path, save_img)
self.num_iters += 1
losses.update(
loss_kpt=heatmap_loss
)
# calculate accuracy
if train_cfg.get('compute_acc', True):
acc_pose = self.get_pose_accuracy(
dt_heatmaps, gt_heatmaps, keypoint_weights > 0.5
)
losses.update(acc_pose=acc_pose)
# Calculate the best binary accuracy for probability
acc_prob, _ = self.get_binary_accuracy(
dt_probs,
gt_probs,
gt_annotated > 0.5,
force_balanced=True,
)
losses.update(acc_prob=acc_prob)
# Calculate the best binary accuracy for visibility
acc_vis, _ = self.get_binary_accuracy(
dt_vis,
gt_vis,
annotated_in > 0.5,
force_balanced=True,
)
losses.update(acc_vis=acc_vis)
# Calculate the MAE for OKS
acc_oks = self.get_mae(
dt_oks,
gt_oks,
annotated_in > 0.5,
)
losses.update(mae_oks=acc_oks)
# Calculate the MAE for euclidean error
acc_err = self.get_mae(
dt_errs,
gt_errs,
annotated_in > 0.5,
)
losses.update(mae_err=acc_err)
# Calculate the MAE between Euclidean error and OKS
err_to_oks_mae = self.get_mae(
self.error_to_OKS(dt_errs, area=H*W),
gt_oks,
annotated_in > 0.5,
)
losses.update(mae_err_to_oks=err_to_oks_mae)
print(self.temperature.item())
return losses
def _visualize_heatmaps(
self,
htm,
tgt,
lss,
weight,
prob
):
tgt_range = (tgt.min(), tgt.max())
htm_range = (htm.min(), htm.max())
lss_range = (lss.min(), lss.max())
tgt[tgt < 0] = 0
htm[htm < 0] = 0
lss[lss < 0] = 0
# Normalize heatmaps between 0 and 1
tgt /= (tgt.max()+1e-10)
htm /= (htm.max()+1e-10)
lss /= (lss.max()+1e-10)
scale = 6
htm_color = cv2.cvtColor((htm*255).astype(np.uint8), cv2.COLOR_GRAY2BGR)
htm_color = cv2.applyColorMap(htm_color, cv2.COLORMAP_JET)
htm_color = cv2.resize(htm_color, (htm.shape[1]*scale, htm.shape[0]*scale), interpolation=cv2.INTER_NEAREST)
tgt_color = cv2.cvtColor((tgt*255).astype(np.uint8), cv2.COLOR_GRAY2BGR)
tgt_color = cv2.applyColorMap(tgt_color, cv2.COLORMAP_JET)
tgt_color = cv2.resize(tgt_color, (htm.shape[1]*scale, htm.shape[0]*scale), interpolation=cv2.INTER_NEAREST)
lss_color = cv2.cvtColor((lss*255).astype(np.uint8), cv2.COLOR_GRAY2BGR)
lss_color = cv2.applyColorMap(lss_color, cv2.COLORMAP_JET)
lss_color = cv2.resize(lss_color, (htm.shape[1]*scale, htm.shape[0]*scale), interpolation=cv2.INTER_NEAREST)
if scale > 2:
tgt_color_text = tgt_color.copy()
cv2.putText(tgt_color_text, "tgt ({:.1f}, {:.1f})".format(tgt_range[0]*10, tgt_range[1]*10), (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
tgt_color = cv2.addWeighted(tgt_color, 0.6, tgt_color_text, 0.4, 0)
htm_color_text = htm_color.copy()
cv2.putText(htm_color_text, "htm ({:.1f}, {:.1f})".format(htm_range[0]*10, htm_range[1]*10), (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
htm_color = cv2.addWeighted(htm_color, 0.6, htm_color_text, 0.4, 0)
lss_color_text = lss_color.copy()
cv2.putText(lss_color_text, "lss ({:.1f}, {:.1f})".format(lss_range[0]*10, lss_range[1]*10), (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
lss_color = cv2.addWeighted(lss_color, 0.6, lss_color_text, 0.4, 0)
# Get argmax of the target and draw horizontal and vertical lines
tgt_argmax = np.unravel_index(tgt.argmax(), tgt.shape)
tgt_color_line = tgt_color.copy()
cv2.line(tgt_color_line, (0, tgt_argmax[0]*scale), (tgt_color.shape[1], tgt_argmax[0]*scale), (0, 255, 255), 1)
cv2.line(tgt_color_line, (tgt_argmax[1]*scale, 0), (tgt_argmax[1]*scale, tgt_color.shape[0]), (0, 255, 255), 1)
tgt_color = cv2.addWeighted(tgt_color, 0.6, tgt_color_line, 0.4, 0)
htm_color_line = htm_color.copy()
cv2.line(htm_color_line, (0, tgt_argmax[0]*scale), (tgt_color.shape[1], tgt_argmax[0]*scale), (0, 255, 255), 1)
cv2.line(htm_color_line, (tgt_argmax[1]*scale, 0), (tgt_argmax[1]*scale, tgt_color.shape[0]), (0, 255, 255), 1)
htm_color = cv2.addWeighted(htm_color, 0.6, htm_color_line, 0.4, 0)
lss_color_line = lss_color.copy()
cv2.line(lss_color_line, (0, tgt_argmax[0]*scale), (tgt_color.shape[1], tgt_argmax[0]*scale), (0, 255, 255), 1)
cv2.line(lss_color_line, (tgt_argmax[1]*scale, 0), (tgt_argmax[1]*scale, tgt_color.shape[0]), (0, 255, 255), 1)
lss_color = cv2.addWeighted(lss_color, 0.6, lss_color_line, 0.4, 0)
white_column = np.ones((tgt_color.shape[0], 1, 3), dtype=np.uint8) * 255
save_img = np.concatenate((
tgt_color,
white_column,
htm_color,
white_column,
lss_color,
), axis=1)
if weight < 0.5:
# Draw a red X across the whole save_img
cv2.line(save_img, (0, 0), (save_img.shape[1], save_img.shape[0]), (0, 0, 255), 2)
cv2.line(save_img, (0, save_img.shape[0]), (save_img.shape[1], 0), (0, 0, 255), 2)
elif prob < 0.5:
            # Draw a yellow X across the whole save_img
cv2.line(save_img, (0, 0), (save_img.shape[1], save_img.shape[0]), (0, 255, 255), 2)
cv2.line(save_img, (0, save_img.shape[0]), (save_img.shape[1], 0), (0, 255, 255), 2)
return save_img
def get_pose_accuracy(self, dt, gt, mask):
"""Calculate the accuracy of predicted pose."""
_, avg_acc, _ = pose_pck_accuracy(
output=to_numpy(dt),
target=to_numpy(gt),
mask=to_numpy(mask),
method='argmax',
)
acc_pose = torch.tensor(avg_acc, device=gt.device)
return acc_pose
def get_binary_accuracy(self, dt, gt, mask, force_balanced=False):
"""Calculate the binary accuracy."""
assert dt.shape == gt.shape
device = gt.device
dt = to_numpy(dt)
gt = to_numpy(gt)
mask = to_numpy(mask)
dt = dt[mask]
gt = gt[mask]
gt = gt.astype(bool)
if force_balanced:
# Force the number of positive and negative samples to be balanced
pos_num = np.sum(gt)
neg_num = len(gt) - pos_num
num = min(pos_num, neg_num)
if num == 0:
return torch.tensor([0.0], device=device), torch.tensor([0.0], device=device)
pos_idx = np.where(gt)[0]
neg_idx = np.where(~gt)[0]
# Randomly sample the same number of positive and negative samples
np.random.shuffle(pos_idx)
np.random.shuffle(neg_idx)
idx = np.concatenate([pos_idx[:num], neg_idx[:num]])
dt = dt[idx]
gt = gt[idx]
n_samples = len(gt)
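        # Sweep candidate thresholds and keep the one with the highest
        # accuracy; broadcasting `dt` against all thresholds at once gives an
        # (n_samples, n_thresholds) boolean grid.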
thresholds = np.arange(0.1, 1.0, 0.05)
preds = (dt[:, None] > thresholds)
correct = preds == gt[:, None]
counts = correct.sum(axis=0)
# Find the threshold that maximizes the accuracy
best_idx = np.argmax(counts)
best_threshold = thresholds[best_idx]
best_acc = counts[best_idx] / n_samples
best_acc = torch.tensor(best_acc, device=device).float()
best_threshold = torch.tensor(best_threshold, device=device).float()
return best_acc, best_threshold
def get_mae(self, dt, gt, mask):
"""Calculate the mean absolute error."""
assert dt.shape == gt.shape
device = gt.device
dt = to_numpy(dt)
gt = to_numpy(gt)
mask = to_numpy(mask)
dt = dt[mask]
gt = gt[mask]
mae = np.abs(dt - gt).mean()
mae = torch.tensor(mae, device=device)
return mae
def _load_state_dict_pre_hook(self, state_dict, prefix, local_meta, *args,
**kwargs):
"""A hook function to convert old-version state dict of
:class:`TopdownHeatmapSimpleHead` (before MMPose v1.0.0) to a
compatible format of :class:`HeatmapHead`.
The hook will be automatically registered during initialization.
"""
version = local_meta.get('version', None)
if version and version >= self._version:
return
# convert old-version state dict
keys = list(state_dict.keys())
for _k in keys:
if not _k.startswith(prefix):
continue
v = state_dict.pop(_k)
k = _k[len(prefix):]
# In old version, "final_layer" includes both intermediate
# conv layers (new "conv_layers") and final conv layers (new
# "final_layer").
#
# If there is no intermediate conv layer, old "final_layer" will
# have keys like "final_layer.xxx", which should be still
# named "final_layer.xxx";
#
# If there are intermediate conv layers, old "final_layer" will
# have keys like "final_layer.n.xxx", where the weights of the last
# one should be renamed "final_layer.xxx", and others should be
# renamed "conv_layers.n.xxx"
k_parts = k.split('.')
if k_parts[0] == 'final_layer':
if len(k_parts) == 3:
assert isinstance(self.conv_layers, nn.Sequential)
idx = int(k_parts[1])
if idx < len(self.conv_layers):
# final_layer.n.xxx -> conv_layers.n.xxx
k_new = 'conv_layers.' + '.'.join(k_parts[1:])
else:
# final_layer.n.xxx -> final_layer.xxx
k_new = 'final_layer.' + k_parts[2]
else:
# final_layer.xxx remains final_layer.xxx
k_new = k
else:
k_new = k
state_dict[prefix + k_new] = v
def error_to_OKS(self, error, area=1.0):
"""Convert the error to OKS."""
sigmas = np.array(
[.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89])/10.0
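        # Per-keypoint OKS approximation: exp(-error^2 / (2 * (2*sigma)^2 * area)),
        # i.e. the same Gaussian weighting used in `compute_oks` below with the
        # euclidean error in place of the decoded keypoint distance.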
if isinstance(error, torch.Tensor):
sigmas = torch.tensor(sigmas, device=error.device)
vars = (sigmas * 2)**2
norm_error = error**2 / vars / area / 2.0
return torch.exp(-norm_error)
def compute_oks(gt, dt, use_area=True, per_kpt=False):
sigmas = np.array(
[.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89])/10.0
vars = (sigmas * 2)**2
k = len(sigmas)
visibility_condition = lambda x: x > 0
g = np.array(gt['keypoints']).reshape(k, 3)
xg = g[:, 0]; yg = g[:, 1]; vg = g[:, 2]
k1 = np.count_nonzero(visibility_condition(vg))
bb = gt['bbox']
x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2
y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2
d = np.array(dt['keypoints']).reshape((k, 3))
xd = d[:, 0]; yd = d[:, 1]
if k1>0:
# measure the per-keypoint distance if keypoints visible
dx = xd - xg
dy = yd - yg
else:
# measure minimum distance to keypoints in (x0,y0) & (x1,y1)
z = np.zeros((k))
dx = np.max((z, x0-xd),axis=0)+np.max((z, xd-x1),axis=0)
dy = np.max((z, y0-yd),axis=0)+np.max((z, yd-y1),axis=0)
if use_area:
e = (dx**2 + dy**2) / vars / (gt['area']+np.spacing(1)) / 2
else:
tmparea = gt['bbox'][3] * gt['bbox'][2] * 0.53
e = (dx**2 + dy**2) / vars / (tmparea+np.spacing(1)) / 2
if per_kpt:
oks = np.exp(-e)
if k1 > 0:
oks[~visibility_condition(vg)] = 0
else:
if k1 > 0:
e=e[visibility_condition(vg)]
oks = np.sum(np.exp(-e)) / e.shape[0]
return oks
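

# Minimal usage sketch for `compute_oks` (illustrative only; the toy
# keypoints, bbox and noise level below are made-up values, not data from
# this repo). It shows the dict layout the function expects and the
# per-keypoint output consumed by `CalibrationHead._oks_from_heatmaps`.
if __name__ == '__main__':
    _rng = np.random.default_rng(0)
    # 17 COCO keypoints as (x, y, visibility) with visibility flag 2
    _gt_kpts = np.concatenate(
        [_rng.uniform(0, 48, size=(17, 2)), np.full((17, 1), 2.0)], axis=1)
    # Perturb only the coordinates, not the visibility flag
    _noise = np.zeros((17, 3))
    _noise[:, :2] = _rng.normal(scale=0.5, size=(17, 2))
    _gt = {'keypoints': _gt_kpts, 'bbox': np.array([0, 0, 64, 48]),
           'area': 64 * 48}
    _dt = {'keypoints': _gt_kpts + _noise, 'bbox': _gt['bbox'],
           'area': _gt['area']}
    # One OKS value per keypoint, all in (0, 1] for visible ground truth
    print(compute_oks(_gt, _dt, use_area=False, per_kpt=True))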