from typing import Optional

import torch
import torch.nn as nn
from einops import rearrange

from hyimage.models.hunyuan.modules.flash_attn_no_pad import flash_attn_no_pad

from .activation_layers import get_activation_layer
from .embed_layers import TextProjection, TimestepEmbedder
from .mlp_layers import MLP
from .modulate_layers import apply_gate
from .norm_layers import get_norm_layer


def attention(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    drop_rate: float = 0.0,
    attn_mask: Optional[torch.Tensor] = None,
    causal: bool = False,
) -> torch.Tensor:
    """
    Compute attention using flash_attn_no_pad.

    Args:
        q: Query tensor of shape [B, L, H, D].
        k: Key tensor of shape [B, L, H, D].
        v: Value tensor of shape [B, L, H, D].
        drop_rate: Dropout rate for attention weights.
        attn_mask: Optional attention mask of shape [B, L].
        causal: Whether to apply causal masking.

    Returns:
        Output tensor after attention of shape [B, L, H*D].
    """
    # Pack q, k, v into a single [B, L, 3, H, D] tensor as expected by flash_attn_no_pad.
    qkv = torch.stack([q, k, v], dim=2)
    if attn_mask is not None and attn_mask.dtype != torch.bool:
        attn_mask = attn_mask.bool()
    x = flash_attn_no_pad(qkv, attn_mask, causal=causal, dropout_p=drop_rate, softmax_scale=None)
    # Merge the head and head-dim axes: [B, L, H, D] -> [B, L, H*D].
    b, s, _, _ = x.shape
    out = x.reshape(b, s, -1)
    return out
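
# Usage sketch (illustrative, not executed at import time): `attention` expects
# per-head-split inputs, e.g. for B=2, L=77, H=8, D=64 on a CUDA device with
# FlashAttention available (an assumption of this sketch):
#
#   q = k = v = torch.randn(2, 77, 8, 64, device="cuda", dtype=torch.bfloat16)
#   mask = torch.ones(2, 77, dtype=torch.bool, device="cuda")  # True = valid token
#   out = attention(q, k, v, attn_mask=mask)                   # -> [2, 77, 512]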


class IndividualTokenRefinerBlock(nn.Module):
    """
    A single block for token refinement with self-attention and MLP.

    Args:
        hidden_size: Hidden dimension size.
        heads_num: Number of attention heads.
        mlp_width_ratio: Expansion ratio for MLP hidden size.
        mlp_drop_rate: Dropout rate for MLP.
        act_type: Activation function type.
        qk_norm: Whether to use QK normalization.
        qk_norm_type: Type of QK normalization.
        qkv_bias: Whether to use bias in QKV projections.
        dtype: Optional torch dtype.
        device: Optional torch device.
    """

    def __init__(
        self,
        hidden_size: int,
        heads_num: int,
        mlp_width_ratio: float = 4.0,
        mlp_drop_rate: float = 0.0,
        act_type: str = "silu",
        qk_norm: bool = False,
        qk_norm_type: str = "layer",
        qkv_bias: bool = True,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.heads_num = heads_num
        head_dim = hidden_size // heads_num
        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)

        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs)
        self.self_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
        qk_norm_layer = get_norm_layer(qk_norm_type)
        self.self_attn_q_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.self_attn_k_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
        )
        self.self_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)

        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs)
        act_layer = get_activation_layer(act_type)
        self.mlp = MLP(
            in_channels=hidden_size,
            hidden_channels=mlp_hidden_dim,
            act_layer=act_layer,
            drop=mlp_drop_rate,
            **factory_kwargs,
        )

        self.adaLN_modulation = nn.Sequential(
            act_layer(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
        )
        # Zero-initialize the modulation
        nn.init.zeros_(self.adaLN_modulation[1].weight)
        nn.init.zeros_(self.adaLN_modulation[1].bias)

    def forward(
        self,
        x: torch.Tensor,
        c: torch.Tensor,  # timestep_aware_representations + context_aware_representations
        attn_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Forward pass for IndividualTokenRefinerBlock.

        Args:
            x: Input tensor of shape [B, L, C].
            c: Conditioning tensor of shape [B, C].
            attn_mask: Optional attention mask of shape [B, L].

        Returns:
            Refined tensor of shape [B, L, C].
        """
        # Per-sample gates for the attention and MLP branches, derived from the conditioning.
        gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)

        # Self-attention branch with gated residual.
        norm_x = self.norm1(x)
        qkv = self.self_attn_qkv(norm_x)
        q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
        q = self.self_attn_q_norm(q).to(v)
        k = self.self_attn_k_norm(k).to(v)
        attn = attention(q, k, v, attn_mask=attn_mask)
        x = x + apply_gate(self.self_attn_proj(attn), gate_msa)

        # MLP branch with gated residual.
        x = x + apply_gate(self.mlp(self.norm2(x)), gate_mlp)
        return x
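
# Usage sketch (illustrative): a single refiner block maps [B, L, C] -> [B, L, C],
# gated by a per-sample conditioning vector c of shape [B, C]. Shapes below are
# assumptions for illustration only:
#
#   block = IndividualTokenRefinerBlock(hidden_size=1024, heads_num=8)
#   x = torch.randn(2, 77, 1024)
#   c = torch.randn(2, 1024)
#   y = block(x, c)  # -> [2, 77, 1024]
#
# Note: the attention path goes through flash_attn_no_pad, which typically requires
# a CUDA device and fp16/bf16 activations.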


class IndividualTokenRefiner(nn.Module):
    """
    Stacks multiple IndividualTokenRefinerBlock modules.

    Args:
        hidden_size: Hidden dimension size.
        heads_num: Number of attention heads.
        depth: Number of blocks.
        mlp_width_ratio: Expansion ratio for MLP hidden size.
        mlp_drop_rate: Dropout rate for MLP.
        act_type: Activation function type.
        qk_norm: Whether to use QK normalization.
        qk_norm_type: Type of QK normalization.
        qkv_bias: Whether to use bias in QKV projections.
        dtype: Optional torch dtype.
        device: Optional torch device.
    """

    def __init__(
        self,
        hidden_size: int,
        heads_num: int,
        depth: int,
        mlp_width_ratio: float = 4.0,
        mlp_drop_rate: float = 0.0,
        act_type: str = "silu",
        qk_norm: bool = False,
        qk_norm_type: str = "layer",
        qkv_bias: bool = True,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.blocks = nn.ModuleList(
            [
                IndividualTokenRefinerBlock(
                    hidden_size=hidden_size,
                    heads_num=heads_num,
                    mlp_width_ratio=mlp_width_ratio,
                    mlp_drop_rate=mlp_drop_rate,
                    act_type=act_type,
                    qk_norm=qk_norm,
                    qk_norm_type=qk_norm_type,
                    qkv_bias=qkv_bias,
                    **factory_kwargs,
                )
                for _ in range(depth)
            ]
        )

    def forward(
        self,
        x: torch.Tensor,
        c: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Forward pass for IndividualTokenRefiner.

        Args:
            x: Input tensor of shape [B, L, C].
            c: Conditioning tensor of shape [B, C].
            mask: Optional mask tensor of shape [B, L].

        Returns:
            Refined tensor of shape [B, L, C].
        """
        if mask is not None:
            mask = mask.clone().bool()
            # Keep the first token valid so a fully masked row cannot make the
            # attention weights NaN.
            mask[:, 0] = True
        for block in self.blocks:
            x = block(x, c, mask)
        return x
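
# Usage sketch (illustrative): the stacked refiner applies `depth` blocks with a
# shared conditioning vector; the mask marks valid (non-padding) tokens. Shapes
# here are assumptions for illustration only:
#
#   refiner = IndividualTokenRefiner(hidden_size=1024, heads_num=8, depth=2)
#   y = refiner(x, c, mask)  # x: [B, L, 1024], c: [B, 1024], mask: [B, L]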


class SingleTokenRefiner(nn.Module):
    """
    Single token refiner block for LLM text embedding refinement.

    Args:
        in_channels: Input feature dimension.
        hidden_size: Hidden dimension size.
        heads_num: Number of attention heads.
        depth: Number of blocks.
        mlp_width_ratio: Expansion ratio for MLP hidden size.
        mlp_drop_rate: Dropout rate for MLP.
        act_type: Activation function type.
        qk_norm: Whether to use QK normalization.
        qk_norm_type: Type of QK normalization.
        qkv_bias: Whether to use bias in QKV projections.
        dtype: Optional torch dtype.
        device: Optional torch device.
    """

    def __init__(
        self,
        in_channels: int,
        hidden_size: int,
        heads_num: int,
        depth: int,
        mlp_width_ratio: float = 4.0,
        mlp_drop_rate: float = 0.0,
        act_type: str = "silu",
        qk_norm: bool = False,
        qk_norm_type: str = "layer",
        qkv_bias: bool = True,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.input_embedder = nn.Linear(in_channels, hidden_size, bias=True, **factory_kwargs)
        act_layer = get_activation_layer(act_type)
        self.t_embedder = TimestepEmbedder(hidden_size, act_layer, **factory_kwargs)
        self.c_embedder = TextProjection(in_channels, hidden_size, act_layer, **factory_kwargs)
        self.individual_token_refiner = IndividualTokenRefiner(
            hidden_size=hidden_size,
            heads_num=heads_num,
            depth=depth,
            mlp_width_ratio=mlp_width_ratio,
            mlp_drop_rate=mlp_drop_rate,
            act_type=act_type,
            qk_norm=qk_norm,
            qk_norm_type=qk_norm_type,
            qkv_bias=qkv_bias,
            **factory_kwargs,
        )

    def forward(
        self,
        x: torch.Tensor,
        t: torch.LongTensor,
        mask: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        """
        Forward pass for SingleTokenRefiner.

        Args:
            x: Input tensor of shape [B, L, in_channels].
            t: Timestep tensor of shape [B].
            mask: Optional mask tensor of shape [B, L].

        Returns:
            Refined tensor of shape [B, L, hidden_size].
        """
        timestep_aware_representations = self.t_embedder(t)

        # Pool the text features into a single context vector per sample,
        # ignoring padding tokens when a mask is provided.
        if mask is None:
            context_aware_representations = x.mean(dim=1)
        else:
            mask_float = mask.float().unsqueeze(-1)  # [B, L, 1]
            context_aware_representations = (x * mask_float).sum(dim=1) / mask_float.sum(dim=1)
        context_aware_representations = self.c_embedder(context_aware_representations)
        c = timestep_aware_representations + context_aware_representations

        x = self.input_embedder(x)
        x = self.individual_token_refiner(x, c, mask)
        return x
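

if __name__ == "__main__":
    # Minimal smoke-test sketch (illustrative, not part of the original module).
    # Dimensions below are assumptions; the attention path uses flash_attn_no_pad,
    # which requires a CUDA device with FlashAttention installed and fp16/bf16
    # activations, hence the autocast context.
    device = torch.device("cuda")

    refiner = SingleTokenRefiner(
        in_channels=4096,  # e.g. the text encoder's hidden size (illustrative)
        hidden_size=1024,
        heads_num=8,
        depth=2,
    ).to(device)

    text_emb = torch.randn(2, 77, 4096, device=device)          # [B, L, in_channels]
    timesteps = torch.randint(0, 1000, (2,), device=device)     # [B]
    mask = torch.ones(2, 77, dtype=torch.long, device=device)   # [B, L], 1 = valid token

    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        refined = refiner(text_emb, timesteps, mask)
    print(refined.shape)  # expected: torch.Size([2, 77, 1024])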