from typing import List, Tuple, Optional, Union, Dict
from einops import rearrange
import torch, os
import torch.nn as nn
import torch.nn.functional as F
from diffusers.models import ModelMixin
from diffusers.configuration_utils import ConfigMixin, register_to_config
from flash_attn.flash_attn_interface import flash_attn_varlen_func
from .activation_layers import get_activation_layer
from .norm_layers import get_norm_layer
from .embed_layers import TimestepEmbedder, PatchEmbed, TextProjection
from .attn_layers import apply_rotary_emb
from .mlp_layers import MLP, MLPEmbedder, FinalLayer
from .modulate_layers import ModulateDiT, modulate, apply_gate
from .token_refiner import SingleTokenRefiner
from .audio_adapters import AudioProjNet2, PerceiverAttentionCA
from .parallel_states import (
    nccl_info,
    get_cu_seqlens,
    get_sequence_parallel_state,
    parallel_attention,
    all_gather,
)
CPU_OFFLOAD = int(os.environ.get("CPU_OFFLOAD", 0))
DISABLE_SP = int(os.environ.get("DISABLE_SP", 0))
print(f'models: cpu_offload={CPU_OFFLOAD}, DISABLE_SP={DISABLE_SP}')
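
# CPU_OFFLOAD inserts torch.cuda.empty_cache() calls between sub-steps, and either flag
# makes the blocks compute attention through the packed flash_attn_varlen_func path
# instead of the sequence-parallel parallel_attention path.
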
class DoubleStreamBlock(nn.Module):
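    """
    MMDiT-style dual-stream block: image and text tokens keep separate modulation,
    QKV projections, and MLPs, while attention runs over the concatenated sequence.
    """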
    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_width_ratio: float,
        mlp_act_type: str = 'gelu_tanh',
        qk_norm: bool = True,
        qk_norm_type: str = 'rms',
        qkv_bias: bool = False,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.deterministic = False
        self.num_heads = num_heads
        head_dim = hidden_size // num_heads
        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)

        self.img_mod = ModulateDiT(hidden_size, factor=6, act_layer=get_activation_layer("silu"), **factory_kwargs)
        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.img_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
        qk_norm_layer = get_norm_layer(qk_norm_type)
        self.img_attn_q_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
            if qk_norm
            else nn.Identity()
        )
        self.img_attn_k_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
            if qk_norm
            else nn.Identity()
        )
        self.img_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.img_mlp = MLP(
            hidden_size,
            mlp_hidden_dim,
            act_layer=get_activation_layer(mlp_act_type),
            bias=True,
            **factory_kwargs
        )

        self.txt_mod = ModulateDiT(hidden_size, factor=6, act_layer=get_activation_layer("silu"), **factory_kwargs)
        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.txt_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
        qk_norm_layer = get_norm_layer(qk_norm_type)
        self.txt_attn_q_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
            if qk_norm
            else nn.Identity()
        )
        self.txt_attn_k_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
            if qk_norm
            else nn.Identity()
        )
        self.txt_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.txt_mlp = MLP(
            hidden_size,
            mlp_hidden_dim,
            act_layer=get_activation_layer(mlp_act_type),
            bias=True,
            **factory_kwargs
        )

    def enable_deterministic(self):
        self.deterministic = True

    def disable_deterministic(self):
        self.deterministic = False
    def forward(
        self,
        img: torch.Tensor,
        txt: torch.Tensor,
        vec: torch.Tensor,
        cu_seqlens_q: Optional[torch.Tensor] = None,
        cu_seqlens_kv: Optional[torch.Tensor] = None,
        max_seqlen_q: Optional[int] = None,
        max_seqlen_kv: Optional[int] = None,
        freqs_cis: tuple = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        img_mod1_shift, img_mod1_scale, img_mod1_gate, img_mod2_shift, img_mod2_scale, img_mod2_gate = (
            self.img_mod(vec).chunk(6, dim=-1)
        )
        txt_mod1_shift, txt_mod1_scale, txt_mod1_gate, txt_mod2_shift, txt_mod2_scale, txt_mod2_gate = (
            self.txt_mod(vec).chunk(6, dim=-1)
        )
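        # Each stream's modulation vector splits into (shift, scale, gate) for the attention
        # sub-layer and (shift, scale, gate) for the MLP sub-layer, applied AdaLN-style.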
        if CPU_OFFLOAD: torch.cuda.empty_cache()

        # Prepare image for attention.
        img_modulated = self.img_norm1(img)
        img_modulated = modulate(img_modulated, shift=img_mod1_shift, scale=img_mod1_scale)
        img_qkv = self.img_attn_qkv(img_modulated)
        if CPU_OFFLOAD: torch.cuda.empty_cache()
        img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
        # Apply QK-Norm if needed.
        img_q = self.img_attn_q_norm(img_q).to(img_v)
        img_k = self.img_attn_k_norm(img_k).to(img_v)
        if CPU_OFFLOAD: torch.cuda.empty_cache()
        # Apply RoPE to the image tokens if rotary frequencies are provided.
        if freqs_cis is not None:
            img_qq, img_kk = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
            assert img_qq.shape == img_q.shape and img_kk.shape == img_k.shape, \
                f'img_qq: {img_qq.shape}, img_q: {img_q.shape}, img_kk: {img_kk.shape}, img_k: {img_k.shape}'
            img_q, img_k = img_qq, img_kk

        # Prepare txt for attention.
        txt_modulated = self.txt_norm1(txt)
        txt_modulated = modulate(txt_modulated, shift=txt_mod1_shift, scale=txt_mod1_scale)
        if CPU_OFFLOAD: torch.cuda.empty_cache()
        txt_qkv = self.txt_attn_qkv(txt_modulated)
        txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
        # Apply QK-Norm if needed.
        txt_q = self.txt_attn_q_norm(txt_q).to(txt_v)
        txt_k = self.txt_attn_k_norm(txt_k).to(txt_v)
        if CPU_OFFLOAD: torch.cuda.empty_cache()

        # Run actual attention over the concatenated image + text sequence.
        q = torch.cat((img_q, txt_q), dim=1)
        k = torch.cat((img_k, txt_k), dim=1)
        v = torch.cat((img_v, txt_v), dim=1)

        # Compute attention.
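        # Packed path: q/k/v from all batch items are flattened along the token dimension and
        # cu_seqlens delimits each sample's img+txt segment for flash_attn_varlen_func;
        # otherwise the sequence-parallel parallel_attention path is used.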
        if CPU_OFFLOAD or DISABLE_SP:
            assert cu_seqlens_q.shape[0] == 2 * img.shape[0] + 1
            q, k, v = [
                x.view(x.shape[0] * x.shape[1], *x.shape[2:])
                for x in [q, k, v]
            ]
            attn = flash_attn_varlen_func(
                q,
                k,
                v,
                cu_seqlens_q,
                cu_seqlens_kv,
                max_seqlen_q,
                max_seqlen_kv,
            )
            attn = attn.view(img_k.shape[0], max_seqlen_q, -1).contiguous()
        else:
            attn, _ = parallel_attention(
                (img_q, txt_q),
                (img_k, txt_k),
                (img_v, txt_v),
                img_q_len=img_q.shape[1],
                img_kv_len=img_k.shape[1],
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_kv=cu_seqlens_kv,
                max_seqlen_q=max_seqlen_q,
                max_seqlen_kv=max_seqlen_kv,
            )
        img_attn, txt_attn = attn[:, :img.shape[1]], attn[:, img.shape[1]:]
        if CPU_OFFLOAD: torch.cuda.empty_cache()

        # Calculate the img blocks.
        img = img + apply_gate(self.img_attn_proj(img_attn), gate=img_mod1_gate)
        img = img + apply_gate(self.img_mlp(modulate(self.img_norm2(img), shift=img_mod2_shift, scale=img_mod2_scale)), gate=img_mod2_gate)
        if CPU_OFFLOAD: torch.cuda.empty_cache()

        # Calculate the txt blocks.
        txt = txt + apply_gate(self.txt_attn_proj(txt_attn), gate=txt_mod1_gate)
        txt = txt + apply_gate(self.txt_mlp(modulate(self.txt_norm2(txt), shift=txt_mod2_shift, scale=txt_mod2_scale)), gate=txt_mod2_gate)
        if CPU_OFFLOAD: torch.cuda.empty_cache()

        return img, txt


class SingleStreamBlock(nn.Module):
    """
    A DiT block with parallel linear layers as described in
    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
    """
    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_width_ratio: float = 4.0,
        mlp_act_type: str = 'gelu_tanh',
        qk_norm: bool = True,
        qk_norm_type: str = 'rms',
        qk_scale: float = None,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.deterministic = False
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        head_dim = hidden_size // num_heads
        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
        self.mlp_hidden_dim = mlp_hidden_dim
        self.scale = qk_scale or head_dim**-0.5

        # qkv and mlp_in
        self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim, **factory_kwargs)
        # proj and mlp_out
        self.linear2 = nn.Linear(hidden_size + mlp_hidden_dim, hidden_size, **factory_kwargs)
        qk_norm_layer = get_norm_layer(qk_norm_type)
        self.q_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
            if qk_norm
            else nn.Identity()
        )
        self.k_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
            if qk_norm
            else nn.Identity()
        )
        self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.mlp_act = get_activation_layer(mlp_act_type)()
        self.modulation = ModulateDiT(hidden_size, factor=3, act_layer=get_activation_layer("silu"), **factory_kwargs)

    def enable_deterministic(self):
        self.deterministic = True

    def disable_deterministic(self):
        self.deterministic = False
    def forward(
        self,
        x: torch.Tensor,
        vec: torch.Tensor,
        txt_len: int,
        cu_seqlens_q: Optional[torch.Tensor] = None,
        cu_seqlens_kv: Optional[torch.Tensor] = None,
        max_seqlen_q: Optional[int] = None,
        max_seqlen_kv: Optional[int] = None,
        freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
    ) -> torch.Tensor:
        mod_shift, mod_scale, mod_gate = (
            self.modulation(vec).chunk(3, dim=-1)
        )
        x_mod = modulate(self.pre_norm(x), shift=mod_shift, scale=mod_scale)
        if CPU_OFFLOAD: torch.cuda.empty_cache()
        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
        q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
        if CPU_OFFLOAD: torch.cuda.empty_cache()
        # Apply QK-Norm if needed.
        q = self.q_norm(q).to(v)
        k = self.k_norm(k).to(v)
        if CPU_OFFLOAD: torch.cuda.empty_cache()

        # Apply RoPE to the image tokens only; the trailing txt_len text tokens are left unrotated.
        if freqs_cis is not None:
            img_q, txt_q = q[:, :-txt_len, :, :], q[:, -txt_len:, :, :]
            img_k, txt_k = k[:, :-txt_len, :, :], k[:, -txt_len:, :, :]
            img_qq, img_kk = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
            assert img_qq.shape == img_q.shape and img_kk.shape == img_k.shape, \
                f'img_qq: {img_qq.shape}, img_q: {img_q.shape}, img_kk: {img_kk.shape}, img_k: {img_k.shape}'
            img_q, img_k = img_qq, img_kk
            q = torch.cat((img_q, txt_q), dim=1)
            k = torch.cat((img_k, txt_k), dim=1)
        if CPU_OFFLOAD: torch.cuda.empty_cache()
        # Compute attention.
        if CPU_OFFLOAD or DISABLE_SP:
            assert cu_seqlens_q.shape[0] == 2 * x.shape[0] + 1, f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, x.shape[0]:{x.shape[0]}"
            # Pack the batch for varlen attention: [b, s+l, h, d] -> [b*(s+l), h, d].
            q, k, v = [
                x.view(x.shape[0] * x.shape[1], *x.shape[2:])
                for x in [q, k, v]
            ]
            attn = flash_attn_varlen_func(
                q,
                k,
                v,
                cu_seqlens_q,
                cu_seqlens_kv,
                max_seqlen_q,
                max_seqlen_kv,
            )
            attn = attn.view(x.shape[0], max_seqlen_q, -1).contiguous()
        else:
            img_v, txt_v = v[:, :-txt_len, :, :], v[:, -txt_len:, :, :]
            attn, _ = parallel_attention(
                (img_q, txt_q),
                (img_k, txt_k),
                (img_v, txt_v),
                img_q_len=img_q.shape[1],
                img_kv_len=img_k.shape[1],
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_kv=cu_seqlens_kv,
                max_seqlen_q=max_seqlen_q,
                max_seqlen_kv=max_seqlen_kv,
            )

        if CPU_OFFLOAD:
            torch.cuda.empty_cache()
            tmp = torch.cat((attn, self.mlp_act(mlp)), 2)
            torch.cuda.empty_cache()
            output = self.linear2(tmp)
        else:
            output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
        return x + apply_gate(output, gate=mod_gate)


class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
    """
    HunyuanVideo Transformer backbone.

    Inherits from ModelMixin and ConfigMixin for compatibility with diffusers' samplers
    (e.g. StableDiffusionPipeline).

    Reference:
    [1] Flux.1: https://github.com/black-forest-labs/flux
    [2] MMDiT: http://arxiv.org/abs/2403.03206,
        https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
    """
    def __init__(
        self,
        args,
        patch_size: list = [1, 2, 2],
        in_channels: int = 4,  # Should be VAE.config.latent_channels.
        out_channels: int = None,
        hidden_size: int = 3072,
        mlp_width_ratio: float = 4.0,
        mlp_act_type: str = 'gelu_tanh',
        num_heads: int = 24,
        depth_double_blocks: int = 19,
        depth_single_blocks: int = 38,
        rope_dim_list: List[int] = [16, 56, 56],
        qkv_bias: bool = True,
        qk_norm: bool = True,
        qk_norm_type: str = 'rms',
        guidance_embed: bool = False,  # For modulation.
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()

        # Text projection. Defaults to a linear projection.
        # Alternative: TokenRefiner. See more details (LI-DiT): http://arxiv.org/abs/2406.11831
        self.text_projection = args.text_projection
        self.text_states_dim = args.text_states_dim
        self.use_attention_mask = args.use_attention_mask
        self.text_states_dim_2 = args.text_states_dim_2
        # Only the configs above are taken from args.

        self.patch_size = patch_size
        self.in_channels = in_channels
        self.out_channels = in_channels if out_channels is None else out_channels
        self.unpatchify_channels = self.out_channels
        self.guidance_embed = guidance_embed
        self.rope_dim_list = rope_dim_list

        if hidden_size % num_heads != 0:
            raise ValueError(
                f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
            )
        pe_dim = hidden_size // num_heads
        if sum(rope_dim_list) != pe_dim:
            raise ValueError(f"Got {rope_dim_list} but expected positional dim {pe_dim}")
        self.hidden_size = hidden_size
        self.num_heads = num_heads

        # image projection
        self.img_in = PatchEmbed(
            self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs
        )
        self.ref_in = PatchEmbed(
            self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs
        )

        # text projection
        if self.text_projection == "linear":
            self.txt_in = TextProjection(
                self.text_states_dim,
                self.hidden_size,
                get_activation_layer("silu"),
                **factory_kwargs
            )
        elif self.text_projection == "single_refiner":
            self.txt_in = SingleTokenRefiner(
                self.text_states_dim, hidden_size, num_heads, depth=2, **factory_kwargs
            )
        else:
            raise NotImplementedError(f"Unsupported text_projection: {self.text_projection}")

        # time modulation
        self.time_in = TimestepEmbedder(
            self.hidden_size, get_activation_layer("silu"), **factory_kwargs
        )
        # text modulation
        self.vector_in = MLPEmbedder(
            self.text_states_dim_2, self.hidden_size, **factory_kwargs
        )
        # guidance modulation
        self.guidance_in = TimestepEmbedder(
            self.hidden_size, get_activation_layer("silu"), **factory_kwargs
        ) if guidance_embed else None

        # double blocks
        self.double_blocks = nn.ModuleList(
            [
                DoubleStreamBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_width_ratio=mlp_width_ratio,
                    mlp_act_type=mlp_act_type,
                    qk_norm=qk_norm,
                    qk_norm_type=qk_norm_type,
                    qkv_bias=qkv_bias,
                    **factory_kwargs
                )
                for _ in range(depth_double_blocks)
            ]
        )

        # single blocks
        self.single_blocks = nn.ModuleList(
            [
                SingleStreamBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_width_ratio=mlp_width_ratio,
                    mlp_act_type=mlp_act_type,
                    qk_norm=qk_norm,
                    qk_norm_type=qk_norm_type,
                    **factory_kwargs
                )
                for _ in range(depth_single_blocks)
            ]
        )

        self.final_layer = FinalLayer(
            self.hidden_size,
            self.patch_size,
            self.out_channels,
            get_activation_layer("silu"),
            **factory_kwargs
        )

        # -------------------- audio_proj_model --------------------
        self.audio_proj = AudioProjNet2(seq_len=10, blocks=5, channels=384, intermediate_dim=1024, output_dim=3072, context_tokens=4)

        # -------------------- motion embedder --------------------
        self.motion_exp = TimestepEmbedder(
            self.hidden_size // 4,
            get_activation_layer("silu"),
            **factory_kwargs
        )
        self.motion_pose = TimestepEmbedder(
            self.hidden_size // 4,
            get_activation_layer("silu"),
            **factory_kwargs
        )
        self.fps_proj = TimestepEmbedder(
            self.hidden_size,
            get_activation_layer("silu"),
            **factory_kwargs
        )
        self.before_proj = nn.Linear(self.hidden_size, self.hidden_size)

        # -------------------- audio_insert_model --------------------
        self.double_stream_list = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
        self.single_stream_list = []
        self.double_stream_map = {str(i): j for j, i in enumerate(self.double_stream_list)}
        self.single_stream_map = {str(i): j + len(self.double_stream_list) for j, i in enumerate(self.single_stream_list)}
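        # One PerceiverAttentionCA adapter per listed block: double_stream_list names the
        # double-stream blocks after which audio features are cross-attended into the image
        # stream, and the maps above translate a block index into an adapter index.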
        self.audio_adapter_blocks = nn.ModuleList([
            PerceiverAttentionCA(dim=3072, dim_head=1024, heads=33) for _ in range(len(self.double_stream_list) + len(self.single_stream_list))
        ])
    def enable_deterministic(self):
        for block in self.double_blocks:
            block.enable_deterministic()
        for block in self.single_blocks:
            block.enable_deterministic()

    def disable_deterministic(self):
        for block in self.double_blocks:
            block.disable_deterministic()
        for block in self.single_blocks:
            block.disable_deterministic()

    def forward(
        self,
        x: torch.Tensor,
        t: torch.Tensor,  # Should be in range(0, 1000).
        ref_latents: torch.Tensor = None,
        text_states: torch.Tensor = None,
        text_mask: torch.Tensor = None,  # Used for cu_seqlens and, if use_attention_mask, the token refiner.
        text_states_2: Optional[torch.Tensor] = None,  # Text embedding for modulation.
        freqs_cos: Optional[torch.Tensor] = None,
        freqs_sin: Optional[torch.Tensor] = None,
        guidance: torch.Tensor = None,  # Guidance for modulation, should be cfg_scale x 1000.
        return_dict: bool = True,
        is_cache: bool = False,
        **additional_kwargs,
    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
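        """
        Args:
            x: latent video tensor of shape (N, C, T, H, W).
            t: diffusion timestep.
            ref_latents: reference latents; tokens of the first reference frame are also
                prepended to the image sequence.
            text_states / text_states_2: text embeddings for the text stream and for modulation.
            additional_kwargs: expects "motion_exp", "motion_pose", "fps", "audio_prompts",
                and "face_mask".

        Returns:
            The model output unpatchified back to (N, C_out, T, H, W), either directly or
            under the "x" key when return_dict is True.
        """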
        out = {}
        img = x
        txt = text_states
        bsz, _, ot, oh, ow = x.shape
        tt, th, tw = ot // self.patch_size[0], oh // self.patch_size[1], ow // self.patch_size[2]

        # Prepare modulation vectors.
        vec = self.time_in(t)
        motion_exp_vec = self.motion_exp(additional_kwargs["motion_exp"].view(-1)).view(x.shape[0], -1)  # (b, 3072)
        vec = vec + motion_exp_vec
        motion_pose_vec = self.motion_pose(additional_kwargs["motion_pose"].view(-1)).view(x.shape[0], -1)  # (b, 3072)
        vec = vec + motion_pose_vec
        fps_vec = self.fps_proj(additional_kwargs["fps"])  # (b, 3072)
        vec = vec + fps_vec
        audio_feature_all = self.audio_proj(additional_kwargs["audio_prompts"])

        # text modulation
        vec = vec + self.vector_in(text_states_2)

        # guidance modulation
        if self.guidance_embed:
            if guidance is None:
                raise ValueError("Didn't get guidance strength for guidance distilled model.")
            else:
                # The timestep embedding is merged into guidance_in (a TimestepEmbedder).
                vec = vec + self.guidance_in(guidance)
        if CPU_OFFLOAD: torch.cuda.empty_cache()

        # Embed image and text.
        ref_latents_first = ref_latents[:, :, :1].clone()
        img, shape_mask = self.img_in(img)
        ref_latents, _ = self.ref_in(ref_latents)
        ref_latents_first, _ = self.img_in(ref_latents_first)
        if self.text_projection == "linear":
            txt = self.txt_in(txt)
        elif self.text_projection == "single_refiner":
            # [b, l, h]
            txt = self.txt_in(txt, t, text_mask if self.use_attention_mask else None)
        else:
            raise NotImplementedError(f"Unsupported text_projection: {self.text_projection}")

        img = self.before_proj(ref_latents) + img
        if CPU_OFFLOAD: torch.cuda.empty_cache()
        ref_length = ref_latents_first.shape[-2]  # [b s c]
        img = torch.cat([ref_latents_first, img], dim=-2)  # Prepend the first-reference-frame tokens.
        img_len = img.shape[1]
        mask_len = img_len - ref_length
        if additional_kwargs["face_mask"].shape[2] == 1:
            face_mask = additional_kwargs["face_mask"].repeat(1, 1, ot, 1, 1)  # Repeat when the mask has a single frame.
        else:
            face_mask = additional_kwargs["face_mask"]
        face_mask = torch.nn.functional.interpolate(face_mask, size=[ot, shape_mask[-2], shape_mask[-1]], mode="nearest")
        face_mask = face_mask.view(-1, mask_len, 1).repeat(1, 1, img.shape[-1]).type_as(img)

        txt_seq_len = txt.shape[1]
        img_seq_len = img.shape[1]
        cu_seqlens_q = get_cu_seqlens(text_mask, img_seq_len)
        cu_seqlens_kv = cu_seqlens_q
        max_seqlen_q = img_seq_len + txt_seq_len
        max_seqlen_kv = max_seqlen_q
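        # cu_seqlens_q/kv hold cumulative sequence lengths (2 * bsz + 1 entries, see the asserts
        # in the blocks) so the packed attention path can delimit each sample's img+txt tokens.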
        if get_sequence_parallel_state():
            sp_size = nccl_info.sp_size
            sp_rank = nccl_info.rank_within_group
            assert img.shape[1] % sp_size == 0, f"Cannot split video sequence into ulysses SP ({sp_size}) parts evenly"
            img = torch.chunk(img, sp_size, dim=1)[sp_rank]
            freqs_cos = torch.chunk(freqs_cos, sp_size, dim=0)[sp_rank]
            freqs_sin = torch.chunk(freqs_sin, sp_size, dim=0)[sp_rank]
        if CPU_OFFLOAD: torch.cuda.empty_cache()
        freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None

        # --------------------- Pass through DiT blocks ------------------------
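        # When is_cache is True, the activations cached just before the last single-stream block
        # on the previous call (self.cache_out) are reused and only that final block is re-run.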
        if not is_cache:
            for layer_num, block in enumerate(self.double_blocks):
                double_block_args = [img, txt, vec, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, freqs_cis]
                img, txt = block(*double_block_args)
                if CPU_OFFLOAD: torch.cuda.empty_cache()
                # Insert audio features into the image stream.
                if layer_num in self.double_stream_list:
                    if get_sequence_parallel_state():
                        img = all_gather(img, dim=1)
                    real_img = img[:, ref_length:].clone().view(bsz, ot, -1, 3072)
                    real_ref_img = torch.zeros_like(img[:, :ref_length].clone())
                    audio_feature_pad = audio_feature_all[:, :1].repeat(1, 3, 1, 1)
                    audio_feature_all_insert = torch.cat([audio_feature_pad, audio_feature_all], dim=1).view(bsz, ot, 16, 3072)
                    double_idx = self.double_stream_map[str(layer_num)]
                    real_img = self.audio_adapter_blocks[double_idx](audio_feature_all_insert, real_img).view(bsz, -1, 3072)
                    img = img + torch.cat((real_ref_img, real_img * face_mask), dim=1)
                    if get_sequence_parallel_state():
                        sp_size = nccl_info.sp_size
                        sp_rank = nccl_info.rank_within_group
                        assert img.shape[1] % sp_size == 0, f"Cannot split video sequence into ulysses SP ({sp_size}) parts evenly"
                        img = torch.chunk(img, sp_size, dim=1)[sp_rank]

            # Merge txt and img to pass through single stream blocks.
            x = torch.cat((img, txt), 1)
            # Compatible with MMDiT.
            if len(self.single_blocks) > 0:
                for layer_num, block in enumerate(self.single_blocks):
                    if layer_num == (len(self.single_blocks) - 1):
                        # Cache the activations right before the last single-stream block.
                        tmp = x[:, :-txt_seq_len, ...]
                        if get_sequence_parallel_state():
                            tmp = all_gather(tmp, dim=1)
                        self.cache_out = torch.cat([tmp, x[:, -txt_seq_len:, ...]], dim=1)
                    single_block_args = [x, vec, txt_seq_len, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, (freqs_cos, freqs_sin)]
                    x = block(*single_block_args)
                    if CPU_OFFLOAD: torch.cuda.empty_cache()
        else:
            if get_sequence_parallel_state():
                sp_size = nccl_info.sp_size
                sp_rank = nccl_info.rank_within_group
                tmp, txt = self.cache_out[:, :-txt_seq_len], self.cache_out[:, -txt_seq_len:]
                tmp = torch.chunk(tmp, sp_size, dim=1)[sp_rank]
                x = torch.cat([tmp, txt], dim=1)
            else:
                x = self.cache_out
            if len(self.single_blocks) > 0:
                for layer_num, block in enumerate(self.single_blocks):
                    if layer_num < (len(self.single_blocks) - 1):
                        continue
                    single_block_args = [x, vec, txt_seq_len, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, (freqs_cos, freqs_sin)]
                    x = block(*single_block_args)
                    if CPU_OFFLOAD: torch.cuda.empty_cache()

        img = x[:, :-txt_seq_len, ...]
        if get_sequence_parallel_state():
            img = all_gather(img, dim=1)
        img = img[:, ref_length:]

        # ---------------------------- Final layer ------------------------------
        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
        img = self.unpatchify(img, tt, th, tw)
        if return_dict:
            out['x'] = img
            return out
        return img
    def unpatchify(self, x, t, h, w):
        """
        x: (N, T, pt * ph * pw * C), where T = t * h * w patch tokens
        imgs: (N, C, t * pt, h * ph, w * pw)
        """
        c = self.unpatchify_channels
        pt, ph, pw = self.patch_size
        assert t * h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], t, h, w, c, pt, ph, pw))
        x = torch.einsum('nthwcopq->nctohpwq', x)
        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
        return imgs
    def params_count(self):
        counts = {
            "double": sum([
                sum(p.numel() for p in block.img_attn_qkv.parameters()) +
                sum(p.numel() for p in block.img_attn_proj.parameters()) +
                sum(p.numel() for p in block.img_mlp.parameters()) +
                sum(p.numel() for p in block.txt_attn_qkv.parameters()) +
                sum(p.numel() for p in block.txt_attn_proj.parameters()) +
                sum(p.numel() for p in block.txt_mlp.parameters())
                for block in self.double_blocks
            ]),
            "single": sum([
                sum(p.numel() for p in block.linear1.parameters()) +
                sum(p.numel() for p in block.linear2.parameters())
                for block in self.single_blocks
            ]),
            "total": sum(p.numel() for p in self.parameters()),
        }
        counts["attn+mlp"] = counts["double"] + counts["single"]
        return counts


#################################################################################
#                              HunyuanVideo Configs                             #
#################################################################################

HUNYUAN_VIDEO_CONFIG = {  # Attn+MLP / Total
    'HYVideo-T/2': {  # 9.0B / 12.5B
        'depth_double_blocks': 20,
        'depth_single_blocks': 40,
        'rope_dim_list': [16, 56, 56],
        'hidden_size': 3072,
        'num_heads': 24,
        'mlp_width_ratio': 4,
    },
}
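
# Note: rope_dim_list sums to 128 = hidden_size / num_heads (3072 / 24), matching the
# per-head positional dimension checked in HYVideoDiffusionTransformer.__init__.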