Spaces:

rahul7star
/

PusaV1

Paused

App Files Files Community

PusaV1 / src /genmo /pusa /vae /models.py

rahul7star

Migrated from GitHub

96257b2 verified about 1 month ago

raw

history blame contribute delete

42.1 kB

	import math
	from functools import partial
	from typing import Any, Callable, Dict, List, Optional, Tuple, Union

	import torch
	import torch.distributed as dist
	import torch.nn as nn
	import torch.nn.functional as F
	from einops import rearrange

	import genmo.mochi_preview.dit.joint_model.context_parallel as cp
	import genmo.mochi_preview.vae.cp_conv as cp_conv
	from genmo.lib.progress import get_new_progress_bar
	from genmo.mochi_preview.vae.cp_conv import cp_pass_frames, gather_all_frames
	from genmo.mochi_preview.vae.latent_dist import LatentDistribution


	def cast_tuple(t, length=1):
	    """Return ``t`` unchanged if it is a tuple, else a tuple repeating ``t`` ``length`` times."""
	    if isinstance(t, tuple):
	        return t
	    return (t,) * length


	class GroupNormSpatial(nn.GroupNorm):
	    """
	    GroupNorm applied per-frame.

	    The time axis is folded into the batch axis, so every frame is
	    normalized on its own; frames are processed a few at a time.
	    """

	    def forward(self, x: torch.Tensor, *, chunk_size: int = 8):
	        """Normalize a [B, C, T, H, W] tensor frame by frame.

	        Args:
	            x: Input tensor. Shape: [B, C, T, H, W].
	            chunk_size: Number of frames passed to GroupNorm per call.

	        Returns:
	            Tensor with the same shape as ``x``.
	        """
	        B, C, T, H, W = x.shape
	        frames = rearrange(x, "B C T H W -> (B T) C H W")
	        normed = torch.empty_like(frames)
	        # Run group norm on `chunk_size` frames per call instead of all at once.
	        for lo in range(0, B * T, chunk_size):
	            hi = lo + chunk_size
	            normed[lo:hi] = super().forward(frames[lo:hi])
	        return rearrange(normed, "(B T) C H W -> B C T H W", B=B, T=T)


	class SafeConv3d(torch.nn.Conv3d):
	    """
	    Conv3d that evaluates very large inputs in chunks along the time axis.

	    NOTE: No support for padding along time dimension.
	    Input must already be padded along time.
	    """

	    def forward(self, input):
	        """Apply the convolution, chunking over time when the input is large.

	        The size estimate assumes 2 bytes per element; inputs estimated above
	        2 GiB (and only when the temporal stride is 1) take the chunked path.
	        """
	        memory_count = input.numel() * 2 / 1024**3
	        if not (memory_count > 2 and self.stride[0] == 1):
	            return super().forward(input)

	        part_num = int(memory_count / 2) + 1
	        k = self.kernel_size[0]
	        # Indices of the last input frame of each output frame's window.
	        window_ends = torch.arange(k - 1, input.size(2))
	        window_chunks = torch.chunk(window_ends, part_num, dim=0)

	        assert self.dilation[0] == 1, f"dilation {self.dilation}"
	        assert self.padding[0] == 0, f"padding {self.padding}"

	        # Compute output size.
	        assert not input.requires_grad
	        B, _, T_in, H_in, W_in = input.shape
	        out_shape = (
	            B,
	            self.out_channels,
	            T_in - k + 1,
	            H_in // self.stride[1],
	            W_in // self.stride[2],
	        )
	        output = torch.empty(out_shape, dtype=input.dtype, device=input.device)

	        for ends in window_chunks:
	            # Each chunk needs the k - 1 frames preceding its first window end.
	            first = ends[0] - k + 1
	            last = ends[-1] + 1
	            piece = super().forward(input[:, :, first:last, :, :])
	            # Output frame i depends on input frames i .. i + k - 1.
	            output[:, :, first : first + piece.size(2), :, :] = piece

	        return output


	class StridedSafeConv3d(torch.nn.Conv3d):
	    """Conv3d whose temporal stride equals its temporal kernel size.

	    Only the ``local_shard=True`` path is implemented: the convolution is
	    applied to the input frames that feed this rank's share of output frames.
	    """

	    def forward(self, input, local_shard: bool = False):
	        """Convolve the slice of ``input`` that produces the local output frames.

	        Raises:
	            NotImplementedError: if ``local_shard`` is False.
	        """
	        assert self.stride[0] == self.kernel_size[0]
	        assert self.dilation[0] == 1
	        assert self.padding[0] == 0

	        t_kernel = self.kernel_size[0]
	        t_stride = self.stride[0]
	        num_out_frames = input.size(2) // t_kernel

	        if not local_shard:
	            raise NotImplementedError

	        # Parallel implementation: pick this rank's output frame indices and
	        # map them back to the input frame range they are computed from.
	        out_idx = cp.local_shard(torch.arange(num_out_frames), dim=0)
	        first = out_idx.min() * t_stride
	        last = out_idx.max() * t_stride + t_kernel
	        # Called with an explicit class because `self` may not be an instance
	        # of this class (ContextParallelConv3d borrows this method).
	        return torch.nn.Conv3d.forward(self, input[:, :, first:last, :, :])


	class ContextParallelConv3d(SafeConv3d):
	    """3D convolution that pads the time axis itself and supports context parallelism.

	    Height and width use "same"-style padding derived from the kernel size;
	    the time axis is padded explicitly in ``forward`` (front-only when causal).
	    """

	    def __init__(
	        self,
	        in_channels,
	        out_channels,
	        kernel_size: Union[int, Tuple[int, int, int]],
	        stride: Union[int, Tuple[int, int, int]],
	        causal: bool = True,
	        context_parallel: bool = True,
	        **kwargs,
	    ):
	        self.causal = causal
	        self.context_parallel = context_parallel
	        kernel_size = cast_tuple(kernel_size, 3)
	        stride = cast_tuple(stride, 3)
	        spatial_pad = tuple((size - 1) // 2 for size in kernel_size[1:])

	        super().__init__(
	            in_channels=in_channels,
	            out_channels=out_channels,
	            kernel_size=kernel_size,
	            stride=stride,
	            dilation=(1, 1, 1),
	            # No temporal padding here; forward() pads time explicitly.
	            padding=(0, *spatial_pad),
	            **kwargs,
	        )

	    def forward(self, x: torch.Tensor):
	        """Pad along time and convolve, exchanging frames across ranks if needed.

	        Args:
	            x: Input tensor. Shape: [B, C, T, H, W] (local frames of this rank).
	        """
	        cp_rank, cp_world_size = cp.get_cp_rank_size()

	        # Compute padding amounts.
	        context_size = self.kernel_size[0] - 1
	        if self.causal:
	            pad_front, pad_back = context_size, 0
	        else:
	            pad_front = context_size // 2
	            pad_back = context_size - pad_front

	        mode = "constant" if self.padding_mode == "zeros" else self.padding_mode

	        # Single-rank case: pad both ends locally and convolve directly.
	        if self.context_parallel and cp_world_size == 1:
	            x = F.pad(x, (0, 0, 0, 0, pad_front, pad_back), mode=mode)
	            return super().forward(x)

	        # Otherwise only the first rank pads the front and only the last
	        # rank pads the back.
	        if cp_rank == 0:
	            x = F.pad(x, (0, 0, 0, 0, pad_front, 0), mode=mode)
	        elif cp_rank == cp_world_size - 1 and pad_back:
	            x = F.pad(x, (0, 0, 0, 0, 0, pad_back), mode=mode)

	        if self.stride[0] == 1:
	            # Receive some frames from previous rank.
	            x = cp_pass_frames(x, context_size)
	            return super().forward(x)

	        # Less efficient implementation for strided convs.
	        # All gather x, infer and chunk.
	        assert x.dtype == torch.bfloat16, f"Expected x to be of type torch.bfloat16, got {x.dtype}"
	        x = gather_all_frames(x)  # [B, C, k - 1 + global_T, H, W]
	        return StridedSafeConv3d.forward(self, x, local_shard=True)


	class Conv1x1(nn.Linear):
	    """1x1 Conv implemented with a linear layer."""

	    def __init__(self, in_features: int, out_features: int, *args, **kwargs):
	        # Extra arguments (e.g. bias, device, dtype) are forwarded to nn.Linear.
	        super().__init__(in_features, out_features, *args, **kwargs)

	    def forward(self, x: torch.Tensor):
	        """Forward pass.

	        Args:
	            x: Input tensor with channels on dim 1. Shape: [B, C, *].

	        Returns:
	            x: Output tensor with channels on dim 1. Shape: [B, C', *].
	        """
	        # nn.Linear acts on the last dim, so move channels there and back.
	        x = x.movedim(1, -1)
	        x = super().forward(x)
	        x = x.movedim(-1, 1)
	        return x


	class DepthToSpaceTime(nn.Module):
	    """Move channel groups into the time, height and width axes."""

	    def __init__(
	        self,
	        temporal_expansion: int,
	        spatial_expansion: int,
	    ):
	        super().__init__()
	        self.temporal_expansion = temporal_expansion
	        self.spatial_expansion = spatial_expansion

	    def extra_repr(self):
	        # Shown when the module is printed.
	        return f"texp={self.temporal_expansion}, sexp={self.spatial_expansion}"

	    def forward(self, x: torch.Tensor):
	        """Forward pass.

	        Args:
	            x: Input tensor. Shape: [B, C, T, H, W].

	        Returns:
	            x: Rearranged tensor. Shape: [B, C / (st * s * s), T * st, H * s, W * s],
	                with st = temporal_expansion and s = spatial_expansion. On
	                context-parallel rank 0 the first st - 1 frames are dropped.
	        """
	        st, s = self.temporal_expansion, self.spatial_expansion
	        x = rearrange(
	            x,
	            "B (C st sh sw) T H W -> B C (T st) (H sh) (W sw)",
	            st=st,
	            sh=s,
	            sw=s,
	        )

	        cp_rank, _ = cp.get_cp_rank_size()
	        if st > 1 and cp_rank == 0:
	            # Drop the first st - 1 frames: the first frame does not need to
	            # be repeated by the temporal expansion.
	            assert all(x.shape)
	            x = x[:, :, st - 1 :]
	            # Slicing must not leave an empty tensor.
	            assert all(x.shape)

	        return x


	def norm_fn(
	    in_channels: int,
	    affine: bool = True,
	):
	    """Build the per-frame GroupNorm (32 groups) used throughout this file."""
	    return GroupNormSpatial(num_groups=32, num_channels=in_channels, affine=affine)


	class ResBlock(nn.Module):
	    """Residual block that preserves the spatial dimensions."""

	    def __init__(
	        self,
	        channels: int,
	        *,
	        affine: bool = True,
	        attn_block: Optional[nn.Module] = None,
	        causal: bool = True,
	        prune_bottleneck: bool = False,
	        padding_mode: str,
	        bias: bool = True,
	    ):
	        """
	        Args:
	            channels: Number of input and output channels.
	            affine: Whether the norm layers have learnable affine parameters.
	            attn_block: Optional module applied after the residual sum.
	            causal: Must be True; passed to both convolutions.
	            prune_bottleneck: If True, the hidden width between the two
	                convolutions is ``channels // 2``.
	            padding_mode: Padding mode passed to both convolutions.
	            bias: Whether the convolutions have a bias.
	        """
	        super().__init__()
	        self.channels = channels

	        assert causal
	        # Width of the tensor between the two convolutions.
	        mid_channels = channels // 2 if prune_bottleneck else channels
	        self.stack = nn.Sequential(
	            norm_fn(channels, affine=affine),
	            nn.SiLU(inplace=True),
	            ContextParallelConv3d(
	                in_channels=channels,
	                out_channels=mid_channels,
	                kernel_size=(3, 3, 3),
	                stride=(1, 1, 1),
	                padding_mode=padding_mode,
	                bias=bias,
	                causal=causal,
	            ),
	            # Normalizes the first conv's output, so it must be sized for
	            # mid_channels (identical to `channels` unless pruning).
	            norm_fn(mid_channels, affine=affine),
	            nn.SiLU(inplace=True),
	            ContextParallelConv3d(
	                in_channels=mid_channels,
	                out_channels=channels,
	                kernel_size=(3, 3, 3),
	                stride=(1, 1, 1),
	                padding_mode=padding_mode,
	                bias=bias,
	                causal=causal,
	            ),
	        )

	        self.attn_block = attn_block if attn_block else nn.Identity()

	    def forward(self, x: torch.Tensor):
	        """Forward pass.

	        Args:
	            x: Input tensor. Shape: [B, C, T, H, W].

	        Returns:
	            ``attn_block(stack(x) + x)``.
	        """
	        residual = x
	        x = self.stack(x)
	        x = x + residual
	        del residual

	        return self.attn_block(x)


	def prepare_for_attention(qkv: torch.Tensor, head_dim: int, qk_norm: bool = True):
	    """Split a packed qkv tensor into per-head q, k, v and optionally L2-normalize q and k.

	    Args:
	        qkv: Input tensor. Shape: [B, L, 3 * num_heads * head_dim].
	        head_dim: Size of each attention head.
	        qk_norm: Whether to L2-normalize q and k along the head dimension.

	    Returns:
	        q, k, v: Each of shape [B, num_heads, L, head_dim].
	    """
	    assert qkv.ndim == 3  # [B, L, 3 * num_heads * head_dim]
	    packed = qkv.size(2)
	    assert packed % (3 * head_dim) == 0
	    num_heads = packed // (3 * head_dim)
	    qkv = qkv.unflatten(2, (3, num_heads, head_dim))

	    # [B, L, 3, num_heads, head_dim] -> three tensors of [B, num_heads, L, head_dim]
	    q, k, v = (part.transpose(1, 2) for part in qkv.unbind(2))

	    if qk_norm:
	        # Mixed precision can change the dtype of normed q/k to float32,
	        # so cast back to the input dtype.
	        q, k = (F.normalize(t, p=2, dim=-1).to(dtype=qkv.dtype) for t in (q, k))

	    return q, k, v


	class Attention(nn.Module):
	    """Causal multi-head self-attention along the time axis, per spatial location."""

	    def __init__(
	        self,
	        dim: int,
	        head_dim: int = 32,
	        qkv_bias: bool = False,
	        out_bias: bool = True,
	        qk_norm: bool = True,
	    ) -> None:
	        super().__init__()
	        self.head_dim = head_dim
	        self.num_heads = dim // head_dim
	        self.qk_norm = qk_norm

	        self.qkv = nn.Linear(dim, 3 * dim, bias=qkv_bias)
	        self.out = nn.Linear(dim, dim, bias=out_bias)

	    def forward(
	        self,
	        x: torch.Tensor,
	        *,
	        chunk_size=2**15,
	    ) -> torch.Tensor:
	        """Compute temporal self-attention.

	        Args:
	            x: Input tensor. Shape: [B, C, T, H, W].
	            chunk_size: Maximum number of sequences per attention call.

	        Returns:
	            x: Output tensor. Shape: [B, C, T, H, W].
	        """
	        B, _, T, H, W = x.shape

	        if T == 1:
	            # No attention for a single frame: queries and keys are thrown
	            # away and only the value projection is used.
	            tokens = x.movedim(1, -1)  # [B, C, T, H, W] -> [B, T, H, W, C]
	            value = self.qkv(tokens).chunk(3, dim=-1)[2]
	            return self.out(value).movedim(-1, 1)  # back to [B, C, T, H, W]

	        # 1D temporal attention: every spatial location is its own sequence.
	        seq = rearrange(x, "B C t h w -> (B h w) t C")
	        q, k, v = prepare_for_attention(self.qkv(seq), self.head_dim, qk_norm=self.qk_norm)

	        def attend(qs, ks, vs):
	            return F.scaled_dot_product_attention(
	                qs,
	                ks,
	                vs,
	                attn_mask=None,
	                dropout_p=0.0,
	                is_causal=True,
	                scale=self.head_dim**-0.5,
	            )

	        num_seqs = q.size(0)
	        if num_seqs <= chunk_size:
	            attended = attend(q, k, v)  # [B, num_heads, t, head_dim]
	        else:
	            # Evaluate in chunks to avoid `RuntimeError: CUDA error: invalid configuration argument.`
	            # Chunks of 2**16 and up cause an error.
	            attended = torch.empty_like(q)
	            for lo in range(0, num_seqs, chunk_size):
	                hi = lo + chunk_size
	                attended[lo:hi].copy_(attend(q[lo:hi], k[lo:hi], v[lo:hi]))

	        assert attended.size(0) == num_seqs
	        # [B, num_heads, t, head_dim] -> [B, t, num_heads * head_dim]
	        merged = attended.transpose(1, 2).flatten(2)

	        out = self.out(merged)
	        return rearrange(out, "(B h w) t C -> B C t h w", B=B, h=H, w=W)


	class AttentionBlock(nn.Module):
	    """Pre-norm residual wrapper around Attention."""

	    def __init__(
	        self,
	        dim: int,
	        **attn_kwargs,
	    ) -> None:
	        super().__init__()
	        self.norm = norm_fn(dim)
	        self.attn = Attention(dim, **attn_kwargs)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Return ``x`` plus the attention output of the normalized ``x``."""
	        attended = self.attn(self.norm(x))
	        return x + attended


	class CausalUpsampleBlock(nn.Module):
	    """Residual blocks, then a 1x1 projection and depth-to-space-time upsampling."""

	    def __init__(
	        self,
	        in_channels: int,
	        out_channels: int,
	        num_res_blocks: int,
	        *,
	        temporal_expansion: int = 2,
	        spatial_expansion: int = 2,
	        **block_kwargs,
	    ):
	        """
	        Args:
	            in_channels: Channels of the input and of the residual blocks.
	            out_channels: Channels after upsampling.
	            num_res_blocks: Number of residual blocks before the projection.
	            temporal_expansion: Upsampling factor along time.
	            spatial_expansion: Upsampling factor along height and width.
	            **block_kwargs: Forwarded to ``block_fn`` for every residual block.
	        """
	        super().__init__()

	        self.blocks = nn.Sequential(*(block_fn(in_channels, **block_kwargs) for _ in range(num_res_blocks)))

	        self.temporal_expansion = temporal_expansion
	        self.spatial_expansion = spatial_expansion

	        # The projection produces the channels that DepthToSpaceTime folds
	        # into the time and space axes.
	        expanded_channels = out_channels * temporal_expansion * (spatial_expansion**2)
	        self.proj = Conv1x1(in_channels, expanded_channels)

	        self.d2st = DepthToSpaceTime(temporal_expansion=temporal_expansion, spatial_expansion=spatial_expansion)

	    def forward(self, x):
	        """Apply residual blocks, projection and depth-to-space-time in that order."""
	        return self.d2st(self.proj(self.blocks(x)))


	def block_fn(channels, *, affine: bool = True, has_attention: bool = False, **block_kwargs):
	    """Build a ResBlock, followed by an AttentionBlock when ``has_attention`` is set.

	    Args:
	        channels: Number of channels of the block.
	        affine: Forwarded to ResBlock.
	        has_attention: Whether to attach an AttentionBlock to the ResBlock.
	        **block_kwargs: Remaining keyword arguments forwarded to ResBlock.
	    """
	    attn_block = AttentionBlock(channels) if has_attention else None
	    return ResBlock(channels, affine=affine, attn_block=attn_block, **block_kwargs)


	def add_fourier_features(inputs: torch.Tensor, start=6, stop=8, step=1):
	    """Append base-2 Fourier features of every input channel along dim 1.

	    Args:
	        inputs: Input tensor. Shape: [B, C, T, H, W].
	        start, stop, step: Range of exponents; frequencies are 2**e * 2 * pi.

	    Returns:
	        Tensor of shape [B, (1 + 2 * num_freqs) * C, T, H, W]: the inputs,
	        then the sines, then the cosines.
	    """
	    num_freqs = (stop - start) // step
	    assert inputs.ndim == 5
	    num_channels = inputs.shape[1]

	    # Create Base 2 Fourier features.
	    exponents = torch.arange(start, stop, step, dtype=inputs.dtype, device=inputs.device)
	    assert num_freqs == len(exponents)
	    omega = torch.pow(2.0, exponents) * (2 * torch.pi)  # [num_freqs]
	    omega = omega.repeat(num_channels)[None, :, None, None, None]  # [1, C * num_freqs, 1, 1, 1]

	    # Interleaved repeat of input channels to match omega, then scale by frequency.
	    phase = omega * inputs.repeat_interleave(num_freqs, dim=1)  # [B, C * num_freqs, T, H, W]

	    return torch.cat([inputs, torch.sin(phase), torch.cos(phase)], dim=1)


	class FourierFeatures(nn.Module):
	    """Module wrapper around ``add_fourier_features`` with a fixed exponent range."""

	    def __init__(self, start: int = 6, stop: int = 8, step: int = 1):
	        super().__init__()
	        self.start, self.stop, self.step = start, stop, step

	    def forward(self, inputs):
	        """Add Fourier features to inputs.

	        Args:
	            inputs: Input tensor. Shape: [B, C, T, H, W]

	        Returns:
	            h: Output tensor. Shape: [B, (1 + 2 * num_freqs) * C, T, H, W]
	        """
	        return add_fourier_features(inputs, start=self.start, stop=self.stop, step=self.step)


	class Decoder(nn.Module):
	    """VAE decoder: input stage, a chain of causal upsample blocks, and an output stage."""

	    def __init__(
	        self,
	        *,
	        out_channels: int = 3,
	        latent_dim: int,
	        base_channels: int,
	        channel_multipliers: List[int],
	        num_res_blocks: List[int],
	        temporal_expansions: Optional[List[int]] = None,
	        spatial_expansions: Optional[List[int]] = None,
	        has_attention: List[bool],
	        output_norm: bool = True,
	        nonlinearity: str = "silu",
	        output_nonlinearity: str = "silu",
	        causal: bool = True,
	        **block_kwargs,
	    ):
	        """
	        Args:
	            out_channels: Channels of the decoded output.
	            latent_dim: Channels of the latent input.
	            base_channels: Base width, scaled by ``channel_multipliers``.
	            channel_multipliers: One multiplier per resolution level.
	            num_res_blocks: Residual blocks per stage; needs
	                ``len(channel_multipliers) + 1`` entries.
	            temporal_expansions: Temporal factor of every upsample block.
	            spatial_expansions: Spatial factor of every upsample block.
	            has_attention: Attention flag per stage (same length as
	                ``num_res_blocks``).
	            output_norm: Must be False (asserted).
	            nonlinearity: Must be "silu" (asserted).
	            output_nonlinearity: "silu" or a falsy value for none.
	            causal: Must be True (asserted).
	            **block_kwargs: Forwarded to every residual block.
	        """
	        super().__init__()
	        self.input_channels = latent_dim
	        self.base_channels = base_channels
	        self.channel_multipliers = channel_multipliers
	        self.num_res_blocks = num_res_blocks
	        self.output_nonlinearity = output_nonlinearity
	        assert nonlinearity == "silu"
	        assert causal

	        widths = [mult * base_channels for mult in channel_multipliers]
	        self.num_up_blocks = len(widths) - 1
	        assert len(num_res_blocks) == self.num_up_blocks + 2

	        make_block = partial(block_fn, padding_mode="replicate")

	        # Input stage: 1x1x1 conv to the widest level, then blocks that
	        # preserve the channel count.
	        stem = [nn.Conv3d(latent_dim, widths[-1], kernel_size=(1, 1, 1))]
	        stem.extend(
	            make_block(widths[-1], has_attention=has_attention[-1], causal=causal, **block_kwargs)
	            for _ in range(num_res_blocks[-1])
	        )
	        stages = [nn.Sequential(*stem)]

	        assert len(temporal_expansions) == len(spatial_expansions) == self.num_up_blocks
	        assert len(num_res_blocks) == len(has_attention) == self.num_up_blocks + 2

	        # Upsample stages walk the per-level lists from the end to the front.
	        for i in range(self.num_up_blocks):
	            src, dst = -i - 1, -i - 2
	            stages.append(
	                CausalUpsampleBlock(
	                    widths[src],
	                    widths[dst],
	                    num_res_blocks=num_res_blocks[dst],
	                    has_attention=has_attention[dst],
	                    temporal_expansion=temporal_expansions[src],
	                    spatial_expansion=spatial_expansions[src],
	                    causal=causal,
	                    padding_mode="replicate",
	                    **block_kwargs,
	                )
	            )

	        assert not output_norm

	        # Output stage. Preserves the channel count.
	        tail = [
	            make_block(widths[0], has_attention=has_attention[0], causal=causal, **block_kwargs)
	            for _ in range(num_res_blocks[0])
	        ]
	        stages.append(nn.Sequential(*tail))

	        self.blocks = nn.ModuleList(stages)
	        self.output_proj = Conv1x1(widths[0], out_channels)

	    def forward(self, x):
	        """Forward pass.

	        Args:
	            x: Latent tensor. Shape: [B, input_channels, t, h, w].

	        Returns:
	            x: Reconstructed video tensor. Shape: [B, C, T, H, W]; the
	                output sizes follow from the configured expansions.
	        """
	        for stage in self.blocks:
	            x = stage(x)

	        if self.output_nonlinearity == "silu":
	            # In-place only outside training.
	            x = F.silu(x, inplace=not self.training)
	        else:
	            assert not self.output_nonlinearity  # StyleGAN3 omits the to-RGB nonlinearity.

	        return self.output_proj(x).contiguous()


	def make_broadcastable(
	    tensor: torch.Tensor,
	    axis: int,
	    ndim: int,
	) -> torch.Tensor:
	    """
	    View a 1D tensor as an ``ndim``-dimensional tensor that is singleton everywhere except ``axis``.

	    Args:
	        tensor (torch.Tensor): The 1D tensor to reshape.
	        axis (int): The axis that keeps the tensor's length. May be negative.
	        ndim (int): The number of dimensions of the result.

	    Returns:
	        torch.Tensor: A view of ``tensor`` suitable for broadcasting.

	    Raises:
	        ValueError: If ``tensor`` is not 1D.
	    """
	    if tensor.dim() != 1:
	        raise ValueError(f"Expected tensor to be 1D, but got {tensor.dim()}D tensor.")

	    # Wrap negative axes into [0, ndim).
	    axis %= ndim
	    shape = [tensor.size(0) if d == axis else 1 for d in range(ndim)]
	    return tensor.view(*shape)


	def blend(a: torch.Tensor, b: torch.Tensor, axis: int) -> torch.Tensor:
	    """
	    Cross-fade from ``a`` to ``b`` along ``axis`` with linearly increasing weights.

	    Args:
	        a (torch.Tensor): The first tensor.
	        b (torch.Tensor): The second tensor. Must have the same shape as `a`.
	        axis (int): The axis along which to perform the blending.

	    Returns:
	        torch.Tensor: ``a * (1 - w) + b * w`` where ``w`` runs from
	        ``1 / (steps + 1)`` to ``steps / (steps + 1)`` along ``axis``.
	    """
	    assert a.shape == b.shape, f"Tensors must have the same shape, got {a.shape} and {b.shape}"
	    steps = a.size(axis)

	    # Weights exclude the endpoints 0 and 1.
	    ramp = torch.linspace(
	        1 / (steps + 1),
	        steps / (steps + 1),
	        steps=steps,
	        device=a.device,
	        dtype=a.dtype,
	    )
	    ramp = make_broadcastable(ramp, axis, a.dim())

	    return a * (1 - ramp) + b * ramp


	def blend_horizontal(a: torch.Tensor, b: torch.Tensor, overlap: int) -> torch.Tensor:
	    """Join ``a`` (left) and ``b`` (right) along the last dim, cross-fading ``overlap`` columns.

	    With ``overlap == 0`` the tensors are simply concatenated.
	    """
	    if overlap == 0:
	        return torch.cat([a, b], dim=-1)

	    assert a.size(-1) >= overlap
	    assert b.size(-1) >= overlap
	    # The trailing columns of `a` and the leading columns of `b` cover the same region.
	    seam = blend(a[..., -overlap:], b[..., :overlap], -1)
	    return torch.cat([a[..., :-overlap], seam, b[..., overlap:]], dim=-1)


	def blend_vertical(a: torch.Tensor, b: torch.Tensor, overlap: int) -> torch.Tensor:
	    """Join ``a`` (top) and ``b`` (bottom) along dim -2, cross-fading ``overlap`` rows.

	    With ``overlap == 0`` the tensors are simply concatenated.
	    """
	    if overlap == 0:
	        return torch.cat([a, b], dim=-2)

	    assert a.size(-2) >= overlap
	    assert b.size(-2) >= overlap
	    # The trailing rows of `a` and the leading rows of `b` cover the same region.
	    seam = blend(a[..., -overlap:, :], b[..., :overlap, :], -2)
	    return torch.cat([a[..., :-overlap, :], seam, b[..., overlap:, :]], dim=-2)


	def nearest_multiple(x: int, multiple: int) -> int:
	    """Round ``x`` to a nearby multiple of ``multiple`` using Python's ``round`` (ties go to even)."""
	    quotient = round(x / multiple)
	    return quotient * multiple


	def apply_tiled(
	    fn: Callable[[torch.Tensor], torch.Tensor],
	    x: torch.Tensor,
	    num_tiles_w: int,
	    num_tiles_h: int,
	    overlap: int = 0,  # Number of pixel of overlap between adjacent tiles.
	    # Use a factor of 2 times the latent downsample factor.
	    min_block_size: int = 1,  # Minimum number of pixels in each dimension when subdividing.
	) -> Optional[torch.Tensor]:
	    """Apply ``fn`` to overlapping spatial tiles of ``x`` and blend the results.

	    The input is halved recursively, along the width first and then the
	    height, until a single tile remains; results are cross-faded over the
	    overlap region.

	    Args:
	        fn: Function applied to every tile.
	        x: Input tensor; the last two dims are height and width.
	        num_tiles_w: Number of tiles along the width (power of 2).
	        num_tiles_h: Number of tiles along the height (power of 2).
	        overlap: Overlap between adjacent tiles, in input pixels.
	        min_block_size: Split positions are multiples of this value.

	    Returns:
	        The blended result, or None if any recursive call returned None.

	    Raises:
	        ValueError: If neither tile count is at least 2 (and not both are 1).
	    """
	    if num_tiles_w == 1 and num_tiles_h == 1:
	        return fn(x)

	    assert num_tiles_w & (num_tiles_w - 1) == 0, f"num_tiles_w={num_tiles_w} must be a power of 2"
	    assert num_tiles_h & (num_tiles_h - 1) == 0, f"num_tiles_h={num_tiles_h} must be a power of 2"

	    H, W = x.shape[-2:]
	    assert H % min_block_size == 0
	    assert W % min_block_size == 0
	    # Each half extends `ov` pixels past the split line.
	    ov = overlap // 2
	    assert ov % min_block_size == 0

	    if num_tiles_w >= 2:
	        # Subdivide horizontally.
	        half_W = nearest_multiple(W // 2, min_block_size)
	        left_in = x[..., : half_W + ov]
	        right_in = x[..., half_W - ov :]

	        assert num_tiles_w % 2 == 0, f"num_tiles_w={num_tiles_w} must be even"
	        left = apply_tiled(fn, left_in, num_tiles_w // 2, num_tiles_h, overlap, min_block_size)
	        right = apply_tiled(fn, right_in, num_tiles_w // 2, num_tiles_h, overlap, min_block_size)
	        if left is None or right is None:
	            return None

	        # If `fn` changed the resolution, adjust the overlap.
	        scale = left.size(-1) / (half_W + ov)
	        return blend_horizontal(left, right, int(overlap * scale))

	    if num_tiles_h >= 2:
	        # Subdivide vertically.
	        half_H = nearest_multiple(H // 2, min_block_size)
	        top_in = x[..., : half_H + ov, :]
	        bottom_in = x[..., half_H - ov :, :]

	        assert num_tiles_h % 2 == 0, f"num_tiles_h={num_tiles_h} must be even"
	        top = apply_tiled(fn, top_in, num_tiles_w, num_tiles_h // 2, overlap, min_block_size)
	        bottom = apply_tiled(fn, bottom_in, num_tiles_w, num_tiles_h // 2, overlap, min_block_size)
	        if top is None or bottom is None:
	            return None

	        # If `fn` changed the resolution, adjust the overlap.
	        scale = top.size(-2) / (half_H + ov)
	        return blend_vertical(top, bottom, int(overlap * scale))

	    raise ValueError(f"Invalid num_tiles_w={num_tiles_w} and num_tiles_h={num_tiles_h}")


	class DownsampleBlock(nn.Module):
	    """Strided convolution followed by residual blocks."""

	    def __init__(
	        self,
	        in_channels: int,
	        out_channels: int,
	        num_res_blocks,
	        *,
	        temporal_reduction=2,
	        spatial_reduction=2,
	        **block_kwargs,
	    ):
	        """
	        Downsample block for the VAE encoder.

	        Args:
	            in_channels: Number of input channels.
	            out_channels: Number of output channels. Must differ from
	                ``in_channels`` (asserted).
	            num_res_blocks: Number of residual blocks.
	            temporal_reduction: Temporal reduction factor.
	            spatial_reduction: Spatial reduction factor.
	            **block_kwargs: Forwarded to ``block_fn``. ``bias`` is also used
	                for the downsampling convolution and defaults to True when
	                omitted, matching the ResBlock default.
	        """
	        super().__init__()

	        assert in_channels != out_channels
	        reduction = (temporal_reduction, spatial_reduction, spatial_reduction)
	        layers = [
	            ContextParallelConv3d(
	                in_channels=in_channels,
	                out_channels=out_channels,
	                # Kernel equals stride, so the windows do not overlap.
	                kernel_size=reduction,
	                stride=reduction,
	                # First layer in each block always uses replicate padding
	                padding_mode="replicate",
	                bias=block_kwargs.get("bias", True),
	            )
	        ]
	        layers.extend(block_fn(out_channels, **block_kwargs) for _ in range(num_res_blocks))

	        self.layers = nn.Sequential(*layers)

	    def forward(self, x):
	        """Apply the downsampling convolution and then the residual blocks."""
	        return self.layers(x)


	class Encoder(nn.Module):
	    """VAE encoder: input projection, residual blocks, downsample blocks, and a latent head."""

	    def __init__(
	        self,
	        *,
	        in_channels: int,
	        base_channels: int,
	        channel_multipliers: List[int],
	        num_res_blocks: List[int],
	        latent_dim: int,
	        temporal_reductions: List[int],
	        spatial_reductions: List[int],
	        prune_bottlenecks: List[bool],
	        has_attentions: List[bool],
	        affine: bool = True,
	        bias: bool = True,
	        input_is_conv_1x1: bool = False,
	        padding_mode: str,
	    ):
	        """
	        Args:
	            in_channels: Channels of the input video tensor.
	            base_channels: Base width, scaled by ``channel_multipliers``.
	            channel_multipliers: One multiplier per resolution level.
	            num_res_blocks: Residual blocks per stage; needs
	                ``len(channel_multipliers) + 1`` entries.
	            latent_dim: Channels of the latent mean (and of the logvar).
	            temporal_reductions: Temporal factor of every downsample block.
	            spatial_reductions: Spatial factor of every downsample block.
	            prune_bottlenecks: Bottleneck flag per stage.
	            has_attentions: Attention flag per stage.
	            affine: Whether norm layers have affine parameters.
	            bias: Whether convolutions have a bias.
	            input_is_conv_1x1: Use ``Conv1x1`` instead of ``nn.Conv3d`` for
	                the input projection.
	            padding_mode: Padding mode for the residual blocks.
	        """
	        super().__init__()
	        self.temporal_reductions = temporal_reductions
	        self.spatial_reductions = spatial_reductions
	        self.base_channels = base_channels
	        self.channel_multipliers = channel_multipliers
	        self.num_res_blocks = num_res_blocks
	        self.latent_dim = latent_dim

	        ch = [mult * base_channels for mult in channel_multipliers]
	        num_down_blocks = len(ch) - 1
	        assert len(num_res_blocks) == num_down_blocks + 2

	        if input_is_conv_1x1:
	            layers = [Conv1x1(in_channels, ch[0])]
	        else:
	            layers = [nn.Conv3d(in_channels, ch[0], kernel_size=(1, 1, 1), bias=True)]

	        assert len(prune_bottlenecks) == num_down_blocks + 2
	        assert len(has_attentions) == num_down_blocks + 2
	        block = partial(block_fn, padding_mode=padding_mode, affine=affine, bias=bias)

	        layers.extend(
	            block(ch[0], has_attention=has_attentions[0], prune_bottleneck=prune_bottlenecks[0])
	            for _ in range(num_res_blocks[0])
	        )
	        # Drop the first entry so index i below refers to downsample block i.
	        prune_bottlenecks = prune_bottlenecks[1:]
	        has_attentions = has_attentions[1:]

	        assert len(temporal_reductions) == len(spatial_reductions) == len(ch) - 1
	        for i in range(num_down_blocks):
	            layers.append(
	                DownsampleBlock(
	                    ch[i],
	                    ch[i + 1],
	                    num_res_blocks=num_res_blocks[i + 1],
	                    temporal_reduction=temporal_reductions[i],
	                    spatial_reduction=spatial_reductions[i],
	                    prune_bottleneck=prune_bottlenecks[i],
	                    has_attention=has_attentions[i],
	                    affine=affine,
	                    bias=bias,
	                    padding_mode=padding_mode,
	                )
	            )

	        # Additional blocks.
	        layers.extend(
	            block(ch[-1], has_attention=has_attentions[-1], prune_bottleneck=prune_bottlenecks[-1])
	            for _ in range(num_res_blocks[-1])
	        )

	        self.layers = nn.Sequential(*layers)

	        # Output layers.
	        self.output_norm = norm_fn(ch[-1])
	        # Twice latent_dim: forward() splits the result into means and logvar.
	        self.output_proj = Conv1x1(ch[-1], 2 * latent_dim, bias=False)

	    @property
	    def temporal_downsample(self):
	        """Product of all temporal reduction factors (requires ``math`` at module level)."""
	        return math.prod(self.temporal_reductions)

	    @property
	    def spatial_downsample(self):
	        """Product of all spatial reduction factors (requires ``math`` at module level)."""
	        return math.prod(self.spatial_reductions)

	    def forward(self, x) -> LatentDistribution:
	        """Forward pass.

	        Args:
	            x: Input video tensor. Shape: [B, C, T, H, W].

	        Returns:
	            LatentDistribution built from ``means`` and ``logvar``, each of
	            shape [B, latent_dim, t, h, w]; the latent sizes follow from the
	            configured temporal and spatial reductions.
	        """
	        assert x.ndim == 5, f"Expected 5D input, got {x.shape}"

	        x = self.layers(x)

	        x = self.output_norm(x)
	        x = F.silu(x, inplace=True)
	        x = self.output_proj(x)

	        # First half of the channels is the mean, second half the log-variance.
	        means, logvar = torch.chunk(x, 2, dim=1)

	        assert means.ndim == 5
	        assert logvar.shape == means.shape
	        assert means.size(1) == self.latent_dim

	        return LatentDistribution(means, logvar)


	def normalize_decoded_frames(samples):
	    """Map decoder output from [-1, 1] to float values clamped to [0, 1], channels last.

	    Args:
	        samples: Tensor of shape [b, c, t, h, w].

	    Returns:
	        Tensor of shape [b, t, h, w, c].
	    """
	    scaled = (samples.float() + 1.0) / 2.0
	    scaled.clamp_(0.0, 1.0)
	    return rearrange(scaled, "b c t h w -> b t h w c")

	@torch.inference_mode()
	def decode_latents_tiled_full(
	    decoder,
	    z,
	    *,
	    tile_sample_min_height: int = 240,
	    tile_sample_min_width: int = 424,
	    tile_overlap_factor_height: float = 0.1666,
	    tile_overlap_factor_width: float = 0.2,
	    auto_tile_size: bool = True,
	    frame_batch_size: int = 6,
	):
	    """Decode latents in overlapping spatial tiles and temporal batches, then blend.

	    Args:
	        decoder: Callable applied to each latent tile.
	        z: Latent tensor. Shape: [B, C, T, H, W].
	        tile_sample_min_height: Tile height in output pixels (ignored when
	            ``auto_tile_size`` is set).
	        tile_sample_min_width: Tile width in output pixels (ignored when
	            ``auto_tile_size`` is set).
	        tile_overlap_factor_height: Fraction of a tile blended vertically.
	        tile_overlap_factor_width: Fraction of a tile blended horizontally.
	        auto_tile_size: Derive the tile size from half the latent size times 8.
	        frame_batch_size: Latent frames decoded per call; must be <= T.

	    Returns:
	        Result of ``normalize_decoded_frames`` on the stitched tiles.
	    """
	    B, C, T, H, W = z.shape
	    assert frame_batch_size <= T, f"frame_batch_size must be <= T, got {frame_batch_size} > {T}"

	    if auto_tile_size:
	        tile_sample_min_height = H // 2 * 8
	        tile_sample_min_width = W // 2 * 8

	    tile_latent_min_height = int(tile_sample_min_height / 8)
	    tile_latent_min_width = int(tile_sample_min_width / 8)

	    def blend_edge(prev: torch.Tensor, cur: torch.Tensor, blend_extent: int, dim: int) -> torch.Tensor:
	        # Cross-fade the trailing edge of `prev` into the leading edge of
	        # `cur` along `dim`. `cur` is modified in place and returned.
	        blend_extent = min(prev.shape[dim], cur.shape[dim], blend_extent)
	        for pos in range(blend_extent):
	            cur.select(dim, pos).copy_(
	                prev.select(dim, pos - blend_extent) * (1 - pos / blend_extent)
	                + cur.select(dim, pos) * (pos / blend_extent)
	            )
	        return cur

	    # Despite the names, these are the steps between tile origins (in latents).
	    overlap_height = int(tile_latent_min_height * (1 - tile_overlap_factor_height))
	    overlap_width = int(tile_latent_min_width * (1 - tile_overlap_factor_width))
	    blend_extent_height = int(tile_sample_min_height * tile_overlap_factor_height)
	    blend_extent_width = int(tile_sample_min_width * tile_overlap_factor_width)
	    row_limit_height = tile_sample_min_height - blend_extent_height
	    row_limit_width = tile_sample_min_width - blend_extent_width

	    row_starts = range(0, H, overlap_height)
	    col_starts = range(0, W, overlap_width)
	    num_batches = T // frame_batch_size
	    # Leftover frames are folded into the first temporal batch.
	    remaining_frames = T % frame_batch_size

	    # Split z into overlapping tiles and decode them separately.
	    # The tiles have an overlap to avoid seams between tiles.
	    pbar = get_new_progress_bar(
	        desc="Decoding latent tiles",
	        total=len(row_starts) * len(col_starts) * num_batches,
	    )
	    rows = []
	    for i in row_starts:
	        row = []
	        for j in col_starts:
	            temporal = []
	            for k in range(num_batches):
	                start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
	                end_frame = frame_batch_size * (k + 1) + remaining_frames
	                latent_tile = z[
	                    :,
	                    :,
	                    start_frame:end_frame,
	                    i : i + tile_latent_min_height,
	                    j : j + tile_latent_min_width,
	                ]
	                temporal.append(decoder(latent_tile))
	                pbar.update(1)
	            row.append(torch.cat(temporal, dim=2))
	        rows.append(row)

	    result_rows = []
	    for i, row in enumerate(rows):
	        result_row = []
	        for j, tile in enumerate(row):
	            # Blend the tile above and the tile to the left into the current
	            # tile, then keep only the part that the next tiles do not cover.
	            if i > 0:
	                tile = blend_edge(rows[i - 1][j], tile, blend_extent_height, 3)
	            if j > 0:
	                tile = blend_edge(row[j - 1], tile, blend_extent_width, 4)
	            result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
	        result_rows.append(torch.cat(result_row, dim=4))

	    return normalize_decoded_frames(torch.cat(result_rows, dim=3))


	@torch.inference_mode()
	def decode_latents_tiled_spatial(
	    decoder,
	    z,
	    *,
	    num_tiles_w: int,
	    num_tiles_h: int,
	    overlap: int = 0,  # Number of pixel of overlap between adjacent tiles.
	    # Use a factor of 2 times the latent downsample factor.
	    min_block_size: int = 1,  # Minimum number of pixels in each dimension when subdividing.
	):
	    """Decode ``z`` with ``apply_tiled`` and normalize the result.

	    Args:
	        decoder: Callable applied to each latent tile.
	        z: Latent tensor; the last two dims are tiled.
	        num_tiles_w: Number of tiles along the width.
	        num_tiles_h: Number of tiles along the height.
	        overlap: Overlap between adjacent tiles.
	        min_block_size: Granularity of the split positions.

	    Returns:
	        Result of ``normalize_decoded_frames`` on the decoded tensor.
	    """
	    decoded = apply_tiled(
	        decoder,
	        z,
	        num_tiles_w,
	        num_tiles_h,
	        overlap,
	        min_block_size,
	    )
	    assert decoded is not None, f"Failed to decode latents with tiled spatial method"
	    return normalize_decoded_frames(decoded)

	@torch.inference_mode()
	def decode_latents(decoder, z):
	    """Decode this rank's temporal slice of ``z`` and gather frames from all ranks.

	    Args:
	        decoder: Callable applied to the local latent slice.
	        z: Latent tensor; dim 2 is split across context-parallel ranks.

	    Returns:
	        Result of ``normalize_decoded_frames`` on the gathered samples.
	    """
	    cp_rank, cp_size = cp.get_cp_rank_size()
	    # Split along the temporal dim and keep this rank's part.
	    local_z = z.tensor_split(cp_size, dim=2)[cp_rank]
	    with torch.autocast("cuda", dtype=torch.bfloat16):
	        local_samples = decoder(local_z)
	    gathered = cp_conv.gather_all_frames(local_samples)
	    return normalize_decoded_frames(gathered)

	@torch.inference_mode()
	def encode_latents(encoder, x):
	    """Encode this rank's temporal slice of ``x``, sample, and gather across ranks.

	    Args:
	        encoder: Callable returning an object with a ``sample()`` method.
	        x: Input tensor; dim 2 is split across context-parallel ranks.

	    Returns:
	        The sampled latents gathered with ``cp_conv.gather_all_frames``.
	    """
	    cp_rank, cp_size = cp.get_cp_rank_size()

	    # Split along the temporal dim and keep this rank's part.
	    local_x = x.tensor_split(cp_size, dim=2)[cp_rank]
	    with torch.autocast("cuda", dtype=torch.bfloat16):
	        latent_dist = encoder(local_x)
	    return cp_conv.gather_all_frames(latent_dist.sample())

	# @torch.inference_mode()
	# def encode_latents(encoder, x):
	# # First gather the input across data parallel processes to ensure consistency
	# if dist.is_initialized():
	# world_size = dist.get_world_size()
	# rank = dist.get_rank()
	# gathered_x = [torch.zeros_like(x) for _ in range(world_size)]
	# dist.all_gather(gathered_x, x)
	# # Use the first process's data to ensure consistency
	# x = gathered_x[0]

	# # Now split along temporal dimension for context parallelism
	# cp_rank, cp_size = cp.get_cp_rank_size()
	# x = x.tensor_split(cp_size, dim=2)[cp_rank]

	# with torch.autocast("cuda", dtype=torch.bfloat16):
	# z = encoder(x)

	# # Gather across CP ranks
	# z = cp_conv.gather_all_frames(z.sample())

	# # Redistribute the results back to data parallel processes
	# if dist.is_initialized():
	# # Split the result back according to the original batch size
	# z_splits = z.chunk(world_size, dim=0)
	# # Each process takes its corresponding split
	# z = z_splits[rank]

	# return z


	if __name__ == "__main__":
	    # Smoke test: load encoder weights, encode a random video on GPU and
	    # print timing and latent statistics.
	    import os
	    import sys
	    import time

	    from safetensors.torch import load_file

	    # Enable TF32
	    torch.backends.cuda.matmul.allow_tf32 = True
	    torch.backends.cudnn.allow_tf32 = True

	    # Configuration matching the official test
	    config = dict(
	        in_channels=15,
	        base_channels=64,
	        channel_multipliers=[1, 2, 4, 6],
	        num_res_blocks=[3, 3, 4, 6, 3],
	        latent_dim=12,
	        temporal_reductions=[1, 2, 3],
	        spatial_reductions=[2, 2, 2],
	        prune_bottlenecks=[False, False, False, False, False],
	        has_attentions=[False, True, True, True, True],
	        affine=True,
	        bias=True,
	        input_is_conv_1x1=True,
	        padding_mode="replicate",
	    )

	    # Initialize encoder
	    encoder = Encoder(**config)

	    # Setup device
	    device = torch.device("cuda:0")
	    encoder = encoder.to(device, memory_format=torch.channels_last_3d)

	    # Load weights. The path may be given as the first command-line
	    # argument; otherwise the original hard-coded location is used.
	    default_weights_path = "/XYFS01/nudt_ljqu_1/Raphael/mochi-1-preview/encoder.safetensors"
	    weights_path = sys.argv[1] if len(sys.argv) > 1 else default_weights_path
	    if not os.path.exists(weights_path):
	        print(f"Error: Weights file not found at {weights_path}")
	        # sys.exit instead of the site-provided `exit` helper, which is
	        # not guaranteed to exist.
	        sys.exit(1)

	    encoder.load_state_dict(load_file(weights_path))
	    encoder.eval()

	    # Create a test input (simulating video data)
	    B, C, T, H, W = 1, 3, 16, 256, 256
	    test_video = torch.randint(0, 255, (B, C, T, H, W), dtype=torch.uint8, device=device)

	    # Convert to float in [-1, 1] range
	    test_video = test_video.float() / 127.5 - 1.0

	    # Add Fourier features (3 channels -> 15 with the default two frequencies)
	    test_video = add_fourier_features(test_video)
	    torch.cuda.synchronize()

	    # Encode video to latent
	    with torch.inference_mode():
	        with torch.autocast("cuda", dtype=torch.bfloat16):
	            print("\nTesting encoder...")
	            t0 = time.time()
	            latent_dist = encoder(test_video)
	            # Wait for the GPU so the measured time covers the whole encode.
	            torch.cuda.synchronize()
	            encode_time = time.time() - t0
	            print(f"Time to encode: {encode_time:.2f}s")

	    # Print shapes and statistics
	    mean = latent_dist.mean
	    logvar = latent_dist.logvar

	    print(f"\nInput shape: {test_video.shape}")
	    print(f"Latent mean shape: {mean.shape}")
	    print(f"Latent logvar shape: {logvar.shape}")

	    # Print statistics
	    print("\nMean statistics:")
	    print(f"Min: {mean.min().item():.4f}")
	    print(f"Max: {mean.max().item():.4f}")
	    print(f"Mean: {mean.mean().item():.4f}")
	    print(f"Std: {mean.std().item():.4f}")

	    print("\nLogvar statistics:")
	    print(f"Min: {logvar.min().item():.4f}")
	    print(f"Max: {logvar.max().item():.4f}")
	    print(f"Mean: {logvar.mean().item():.4f}")
	    print(f"Std: {logvar.std().item():.4f}")

	    # Test sampling
	    print("\nTesting sampling from latent distribution...")
	    sampled = latent_dist.sample()
	    print(f"Sampled shape: {sampled.shape}")