Spaces:

houyuanchen
/

lino

Runtime error

lino / Stable3DGen /hi3dgen /models /structured_latent_vae /decoder_mesh.py

algohunt

initial_commit

c295391 2 months ago

7.14 kB

	# MIT License

	# Copyright (c) Microsoft

	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:

	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.

	# Copyright (c) [2025] [Microsoft]
	# Copyright (c) [2025] [Chongjie Ye]
	# SPDX-License-Identifier: MIT
	# This file has been modified by Chongjie Ye on 2025/04/10
	# Original file was released under MIT, with the full license text # available at https://github.com/atong01/conditional-flow-matching/blob/1.0.7/LICENSE.
	# This modified file is released under the same license.
	from typing import *
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import numpy as np
	from ...modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
	from ...modules import sparse as sp
	from .base import SparseTransformerBase
	from ...representations import MeshExtractResult
	from ...representations.mesh import SparseFeatures2Mesh


	class SparseSubdivideBlock3d(nn.Module):
	"""
	A 3D subdivide block that can subdivide the sparse tensor.

	Args:
	channels: channels in the inputs and outputs.
	out_channels: if specified, the number of output channels.
	num_groups: the number of groups for the group norm.
	"""
	def __init__(
	self,
	channels: int,
	resolution: int,
	out_channels: Optional[int] = None,
	num_groups: int = 32
	):
	super().__init__()
	self.channels = channels
	self.resolution = resolution
	self.out_resolution = resolution * 2
	self.out_channels = out_channels or channels

	self.act_layers = nn.Sequential(
	sp.SparseGroupNorm32(num_groups, channels),
	sp.SparseSiLU()
	)

	self.sub = sp.SparseSubdivide()

	self.out_layers = nn.Sequential(
	sp.SparseConv3d(channels, self.out_channels, 3, indice_key=f"res_{self.out_resolution}"),
	sp.SparseGroupNorm32(num_groups, self.out_channels),
	sp.SparseSiLU(),
	zero_module(sp.SparseConv3d(self.out_channels, self.out_channels, 3, indice_key=f"res_{self.out_resolution}")),
	)

	if self.out_channels == channels:
	self.skip_connection = nn.Identity()
	else:
	self.skip_connection = sp.SparseConv3d(channels, self.out_channels, 1, indice_key=f"res_{self.out_resolution}")

	def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
	"""
	Apply the block to a Tensor, conditioned on a timestep embedding.

	Args:
	x: an [N x C x ...] Tensor of features.
	Returns:
	an [N x C x ...] Tensor of outputs.
	"""
	h = self.act_layers(x)
	h = self.sub(h)
	x = self.sub(x)
	h = self.out_layers(h)
	h = h + self.skip_connection(x)
	return h


	class SLatMeshDecoder(SparseTransformerBase):
	def __init__(
	self,
	resolution: int,
	model_channels: int,
	latent_channels: int,
	num_blocks: int,
	num_heads: Optional[int] = None,
	num_head_channels: Optional[int] = 64,
	mlp_ratio: float = 4,
	attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
	window_size: int = 8,
	pe_mode: Literal["ape", "rope"] = "ape",
	use_fp16: bool = False,
	use_checkpoint: bool = False,
	qk_rms_norm: bool = False,
	representation_config: dict = None,
	):
	super().__init__(
	in_channels=latent_channels,
	model_channels=model_channels,
	num_blocks=num_blocks,
	num_heads=num_heads,
	num_head_channels=num_head_channels,
	mlp_ratio=mlp_ratio,
	attn_mode=attn_mode,
	window_size=window_size,
	pe_mode=pe_mode,
	use_fp16=use_fp16,
	use_checkpoint=use_checkpoint,
	qk_rms_norm=qk_rms_norm,
	)
	self.resolution = resolution
	self.rep_config = representation_config
	self.mesh_extractor = SparseFeatures2Mesh(res=self.resolution*4, use_color=self.rep_config.get('use_color', False))
	self.out_channels = self.mesh_extractor.feats_channels
	self.upsample = nn.ModuleList([
	SparseSubdivideBlock3d(
	channels=model_channels,
	resolution=resolution,
	out_channels=model_channels // 4
	),
	SparseSubdivideBlock3d(
	channels=model_channels // 4,
	resolution=resolution * 2,
	out_channels=model_channels // 8
	)
	])
	self.out_layer = sp.SparseLinear(model_channels // 8, self.out_channels)

	self.initialize_weights()
	if use_fp16:
	self.convert_to_fp16()

	def initialize_weights(self) -> None:
	super().initialize_weights()
	# Zero-out output layers:
	nn.init.constant_(self.out_layer.weight, 0)
	nn.init.constant_(self.out_layer.bias, 0)

	def convert_to_fp16(self) -> None:
	"""
	Convert the torso of the model to float16.
	"""
	super().convert_to_fp16()
	self.upsample.apply(convert_module_to_f16)

	def convert_to_fp32(self) -> None:
	"""
	Convert the torso of the model to float32.
	"""
	super().convert_to_fp32()
	self.upsample.apply(convert_module_to_f32)

	def to_representation(self, x: sp.SparseTensor) -> List[MeshExtractResult]:
	"""
	Convert a batch of network outputs to 3D representations.

	Args:
	x: The [N x * x C] sparse tensor output by the network.

	Returns:
	list of representations
	"""
	ret = []
	for i in range(x.shape[0]):
	mesh = self.mesh_extractor(x[i], training=self.training)
	ret.append(mesh)
	return ret

	def forward(self, x: sp.SparseTensor) -> List[MeshExtractResult]:
	h = super().forward(x)
	for block in self.upsample:
	h = block(h)
	h = h.type(x.dtype)
	h = self.out_layer(h)
	return self.to_representation(h)