# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmengine.model import BaseModule, ModuleList
from torch import Tensor

from mmpose.models.utils import (DetrTransformerEncoder, RepVGGBlock,
                                 SinePositionalEncoding)
from mmpose.registry import MODELS
from mmpose.utils.typing import ConfigType, OptConfigType


class CSPRepLayer(BaseModule):
    """CSPRepLayer, a layer that combines Cross Stage Partial Networks with
    RepVGG Blocks.

    Args:
        in_channels (int): Number of input channels to the layer.
        out_channels (int): Number of output channels from the layer.
        num_blocks (int): The number of RepVGG blocks to be used in the layer.
            Defaults to 3.
        widen_factor (float): Expansion factor for intermediate channels.
            Determines the hidden channel size based on out_channels.
            Defaults to 1.0.
        norm_cfg (dict): Configuration for normalization layers.
            Defaults to Batch Normalization with trainable parameters.
        act_cfg (dict): Configuration for activation layers.
            Defaults to SiLU (Swish) with in-place operation.
    """
    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 num_blocks: int = 3,
                 widen_factor: float = 1.0,
                 norm_cfg: OptConfigType = dict(type='BN', requires_grad=True),
                 act_cfg: OptConfigType = dict(type='SiLU', inplace=True)):
        super(CSPRepLayer, self).__init__()
        hidden_channels = int(out_channels * widen_factor)
        self.conv1 = ConvModule(
            in_channels,
            hidden_channels,
            kernel_size=1,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.conv2 = ConvModule(
            in_channels,
            hidden_channels,
            kernel_size=1,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.bottlenecks = nn.Sequential(*[
            RepVGGBlock(hidden_channels, hidden_channels, act_cfg=act_cfg)
            for _ in range(num_blocks)
        ])
        if hidden_channels != out_channels:
            self.conv3 = ConvModule(
                hidden_channels,
                out_channels,
                kernel_size=1,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg)
        else:
            self.conv3 = nn.Identity()
    def forward(self, x: Tensor) -> Tensor:
        """Forward function.

        Args:
            x (Tensor): The input tensor.

        Returns:
            Tensor: The output tensor.
        """
        x_1 = self.conv1(x)
        x_1 = self.bottlenecks(x_1)
        x_2 = self.conv2(x)
        return self.conv3(x_1 + x_2)
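

# A minimal usage sketch of CSPRepLayer (shapes are illustrative; assumes the
# default widen_factor and BN/SiLU configs):
#
#   layer = CSPRepLayer(in_channels=512, out_channels=256, num_blocks=3)
#   x = torch.randn(2, 512, 32, 32)
#   y = layer(x)  # -> (2, 256, 32, 32); the conv1 + bottlenecks branch and
#                 #    the conv2 shortcut are summed, then fused by conv3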


class HybridEncoder(BaseModule):
    """Hybrid encoder neck introduced in `RT-DETR` by Lyu et al (2023),
    combining transformer encoders with a Feature Pyramid Network (FPN) and a
    Path Aggregation Network (PAN).

    Args:
        encoder_cfg (ConfigType): Configuration for the transformer encoder
            layers. Defaults to an empty dict.
        projector (OptConfigType, optional): Configuration for an optional
            projector module applied to the output features. Defaults to None.
        num_encoder_layers (int, optional): Number of transformer encoder
            layers. Defaults to 1.
        in_channels (List[int], optional): Input channels of feature maps.
            Defaults to [512, 1024, 2048].
        feat_strides (List[int], optional): Strides of feature maps.
            Defaults to [8, 16, 32].
        hidden_dim (int, optional): Hidden dimension of the projected
            features and the encoder. Defaults to 256.
        use_encoder_idx (List[int], optional): Indices of the input feature
            maps that the transformer encoder is applied to. Defaults to [2].
        pe_temperature (int, optional): Positional encoding temperature.
            Defaults to 10000.
        widen_factor (float, optional): Expansion factor for CSPRepLayer.
            Defaults to 1.0.
        deepen_factor (float, optional): Depth multiplier for CSPRepLayer.
            Defaults to 1.0.
        spe_learnable (bool, optional): Whether the sine positional encoding
            is learnable. Defaults to False.
        output_indices (Optional[List[int]], optional): Indices of the output
            feature maps. Defaults to None, which returns all of them.
        norm_cfg (OptConfigType, optional): Configuration for normalization
            layers. Defaults to Batch Normalization.
        act_cfg (OptConfigType, optional): Configuration for activation
            layers. Defaults to SiLU (Swish) with in-place operation.

    .. _`RT-DETR`: https://arxiv.org/abs/2304.08069
    """
    def __init__(self,
                 encoder_cfg: ConfigType = dict(),
                 projector: OptConfigType = None,
                 num_encoder_layers: int = 1,
                 in_channels: List[int] = [512, 1024, 2048],
                 feat_strides: List[int] = [8, 16, 32],
                 hidden_dim: int = 256,
                 use_encoder_idx: List[int] = [2],
                 pe_temperature: int = 10000,
                 widen_factor: float = 1.0,
                 deepen_factor: float = 1.0,
                 spe_learnable: bool = False,
                 output_indices: Optional[List[int]] = None,
                 norm_cfg: OptConfigType = dict(type='BN', requires_grad=True),
                 act_cfg: OptConfigType = dict(type='SiLU', inplace=True)):
        super(HybridEncoder, self).__init__()
        self.in_channels = in_channels
        self.feat_strides = feat_strides
        self.hidden_dim = hidden_dim
        self.use_encoder_idx = use_encoder_idx
        self.num_encoder_layers = num_encoder_layers
        self.pe_temperature = pe_temperature
        self.output_indices = output_indices

        # channel projection
        self.input_proj = ModuleList()
        for in_channel in in_channels:
            self.input_proj.append(
                ConvModule(
                    in_channel,
                    hidden_dim,
                    kernel_size=1,
                    padding=0,
                    norm_cfg=norm_cfg,
                    act_cfg=None))

        # encoder transformer
        if len(use_encoder_idx) > 0:
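            # `pos_enc_dim` is half of `hidden_dim`: the 2-D sine positional
            # encoding is built per spatial axis and combined, so the final
            # embedding fed to the encoder matches `hidden_dim`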
            pos_enc_dim = self.hidden_dim // 2
            self.encoder = ModuleList([
                DetrTransformerEncoder(num_encoder_layers, encoder_cfg)
                for _ in range(len(use_encoder_idx))
            ])
            self.sincos_pos_enc = SinePositionalEncoding(
                pos_enc_dim,
                learnable=spe_learnable,
                temperature=self.pe_temperature,
                spatial_dim=2)

        # top-down fpn
        lateral_convs = list()
        fpn_blocks = list()
        for idx in range(len(in_channels) - 1, 0, -1):
            lateral_convs.append(
                ConvModule(
                    hidden_dim,
                    hidden_dim,
                    1,
                    1,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
            fpn_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    round(3 * deepen_factor),
                    act_cfg=act_cfg,
                    widen_factor=widen_factor))
        self.lateral_convs = ModuleList(lateral_convs)
        self.fpn_blocks = ModuleList(fpn_blocks)

        # bottom-up pan
        downsample_convs = list()
        pan_blocks = list()
        for idx in range(len(in_channels) - 1):
            downsample_convs.append(
                ConvModule(
                    hidden_dim,
                    hidden_dim,
                    3,
                    stride=2,
                    padding=1,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
            pan_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    round(3 * deepen_factor),
                    act_cfg=act_cfg,
                    widen_factor=widen_factor))
        self.downsample_convs = ModuleList(downsample_convs)
        self.pan_blocks = ModuleList(pan_blocks)

        if projector is not None:
            self.projector = MODELS.build(projector)
        else:
            self.projector = None

    def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]:
        """Forward function."""
        assert len(inputs) == len(self.in_channels)

        proj_feats = [
            self.input_proj[i](inputs[i]) for i in range(len(inputs))
        ]

        # encoder
        if self.num_encoder_layers > 0:
            for i, enc_ind in enumerate(self.use_encoder_idx):
                h, w = proj_feats[enc_ind].shape[2:]
                # flatten [B, C, H, W] to [B, HxW, C]
                src_flatten = proj_feats[enc_ind].flatten(2).permute(
                    0, 2, 1).contiguous()
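
                # in deploy/ONNX-export mode, reuse the positional encoding
                # buffer registered by `switch_to_deploy`; otherwise compute
                # it on the fly for the current feature size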
                if torch.onnx.is_in_onnx_export():
                    pos_enc = getattr(self, f'pos_enc_{i}')
                else:
                    pos_enc = self.sincos_pos_enc(size=(h, w))
                    pos_enc = pos_enc.transpose(-1, -2).reshape(1, h * w, -1)
                memory = self.encoder[i](
                    src_flatten, query_pos=pos_enc, key_padding_mask=None)

                proj_feats[enc_ind] = memory.permute(
                    0, 2, 1).contiguous().view([-1, self.hidden_dim, h, w])

        # top-down fpn
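        # (walk from the highest pyramid level down: upsample the fused
        #  feature, concatenate it with the next lower-level feature and
        #  fuse the pair with a CSPRepLayer)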
        inner_outs = [proj_feats[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = proj_feats[idx - 1]
            feat_high = self.lateral_convs[len(self.in_channels) - 1 - idx](
                feat_high)
            inner_outs[0] = feat_high

            upsample_feat = F.interpolate(
                feat_high, scale_factor=2., mode='nearest')
            inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
                torch.cat([upsample_feat, feat_low], axis=1))
            inner_outs.insert(0, inner_out)

        # bottom-up pan
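        # (walk back up: downsample the fused feature with a stride-2 conv,
        #  concatenate it with the corresponding FPN output and fuse the
        #  pair with a CSPRepLayer)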
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsample_convs[idx](feat_low)  # Conv
            out = self.pan_blocks[idx](  # CSPRepLayer
                torch.cat([downsample_feat, feat_high], axis=1))
            outs.append(out)

        if self.output_indices is not None:
            outs = [outs[i] for i in self.output_indices]

        if self.projector is not None:
            outs = self.projector(outs)

        return tuple(outs)

    def switch_to_deploy(self, test_cfg):
        """Switch to deploy mode."""
        if getattr(self, 'deploy', False):
            return

        if self.num_encoder_layers > 0:
            for i, enc_ind in enumerate(self.use_encoder_idx):
                h, w = test_cfg['input_size']
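                # the feature selected by `enc_ind` has stride
                # 2**(3 + enc_ind), i.e. 8/16/32 with the default
                # `feat_strides`, so divide the input size by that stride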
                h = int(h / 2**(3 + enc_ind))
                w = int(w / 2**(3 + enc_ind))
                pos_enc = self.sincos_pos_enc(size=(h, w))
                pos_enc = pos_enc.transpose(-1, -2).reshape(1, h * w, -1)
                self.register_buffer(f'pos_enc_{i}', pos_enc)

        self.deploy = True
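

# A minimal usage sketch of HybridEncoder (illustrative; the `encoder_cfg`
# keys below are assumptions and should follow the DetrTransformerEncoder
# layer config used by the surrounding model):
#
#   neck = HybridEncoder(
#       encoder_cfg=dict(
#           self_attn_cfg=dict(embed_dims=256, num_heads=8),
#           ffn_cfg=dict(embed_dims=256, feedforward_channels=1024)),
#       in_channels=[512, 1024, 2048],
#       hidden_dim=256)
#   feats = (torch.randn(1, 512, 80, 80),
#            torch.randn(1, 1024, 40, 40),
#            torch.randn(1, 2048, 20, 20))
#   outs = neck(feats)  # 3 feature maps with 256 channels at strides 8/16/32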