Spaces:
Build error
Build error
# Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
# THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
# IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
# By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
# any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
# You will be deemed to have recognized and accepted the content of this Agreement,
# which is effective immediately.
# For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
# and their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].
from typing import Dict, List, Optional, Tuple

import torch
class AdaptiveDepthCompressor:
    r"""
    Adaptive depth compressor to solve the problem of excessive background depth variance
    in 3D world generation. This class provides methods to compress background and foreground
    depth values based on statistical analysis of depth distributions, with options for
    smooth compression and outlier removal.
    Args:
        cv_thresholds: Tuple of (low, high) thresholds for coefficient of variation (CV).
        compression_quantiles: Tuple of (low, medium, high) quantiles for depth compression,
            used for the "gentle", "standard" and "aggressive" strategies respectively.
        fg_bg_depth_margin: Margin factor to ensure background depth is greater than foreground.
        enable_smooth_compression: Whether to use smooth compression instead of hard truncation.
        outlier_removal_method: Method for outlier removal, options are "iqr", "quantile", or "none".
            Any other value behaves like "none".
        min_compression_depth: Minimum depth threshold for compression to be applied.
    """

    def __init__(
        self,
        cv_thresholds: Tuple[float, float] = (0.3, 0.8),
        compression_quantiles: Tuple[float, float, float] = (0.95, 0.92, 0.85),
        fg_bg_depth_margin: float = 1.1,
        enable_smooth_compression: bool = True,
        outlier_removal_method: str = "iqr",
        min_compression_depth: float = 6.0,
    ):
        self.cv_thresholds = cv_thresholds
        self.compression_quantiles = compression_quantiles
        self.fg_bg_depth_margin = fg_bg_depth_margin
        self.enable_smooth_compression = enable_smooth_compression
        self.outlier_removal_method = outlier_removal_method
        self.min_compression_depth = min_compression_depth

    @staticmethod
    def _to_tensor(value, device: Optional[torch.device] = None) -> torch.Tensor:
        r"""
        Return ``value`` as a tensor, optionally moved to ``device``.
        Args:
            value: A torch tensor or an array-like object (e.g. a numpy array).
            device: Target device; ``None`` keeps the current device of a tensor
                (array-likes end up on the CPU).
        Returns:
            The tensor view/copy of ``value`` on the requested device.
        """
        return torch.as_tensor(value, device=device)

    @staticmethod
    def _quantile(values: torch.Tensor, q) -> torch.Tensor:
        r"""
        Compute quantile(s) of ``values``.
        Args:
            values: Tensor of values (flattened by ``torch.quantile``).
            q: A single quantile or a sequence of quantiles in [0, 1].
        Returns:
            A 0-dim tensor for a scalar ``q``, or a 1-D tensor with one entry per quantile.
        """
        # torch.quantile requires q to share dtype and device with the input, so a
        # default float32 q would fail on float64 depth maps.
        q_tensor = torch.as_tensor(q, dtype=values.dtype, device=values.device)
        return torch.quantile(values, q_tensor)

    def _remove_outliers(self, depth_vals: torch.Tensor) -> torch.Tensor:
        r"""
        Remove outliers from depth values
        based on the specified method (IQR or quantile).
        Args:
            depth_vals: Non-empty tensor of depth values to process.
        Returns:
            Tensor of depth values with outliers removed. The input is returned
            unchanged when the method is neither "iqr" nor "quantile", or when
            filtering would remove every value.
        """
        if self.outlier_removal_method == "iqr":
            q25, q75 = self._quantile(depth_vals, [0.25, 0.75])
            iqr = q75 - q25
            lower_bound, upper_bound = q25 - 1.5 * iqr, q75 + 1.5 * iqr
        elif self.outlier_removal_method == "quantile":
            lower_bound, upper_bound = self._quantile(depth_vals, [0.05, 0.95])
        else:
            return depth_vals
        valid_mask = (depth_vals >= lower_bound) & (depth_vals <= upper_bound)
        return depth_vals[valid_mask] if valid_mask.any() else depth_vals

    def _collect_foreground_depths(
        self,
        layered_world_depth: List[Dict]
    ) -> List[torch.Tensor]:
        r"""
        Collect depth information of all foreground layers (remove outliers)
        from the layered world depth representation.
        Args:
            layered_world_depth: List of dictionaries containing depth information for each layer.
                Each entry provides "name" and "distance" and optionally "mask";
                "distance" and "mask" may be tensors or numpy arrays.
        Returns:
            List of 1-D tensors containing cleaned foreground depth values
            (layers named "background" and empty layers are skipped).
        """
        fg_depths = []
        for layer_depth in layered_world_depth:
            if layer_depth["name"] == "background":
                continue
            # Accept numpy arrays as well, like _get_pixelwise_foreground_max_depth does.
            depth_vals = self._to_tensor(layer_depth["distance"])
            mask = layer_depth.get("mask", None)
            if mask is not None:
                # Process only the depth values within the mask area
                mask = self._to_tensor(mask, depth_vals.device)
                depth_vals = depth_vals[mask.bool()]
            else:
                # Keep every collected entry 1-D so they can be concatenated later
                depth_vals = depth_vals.reshape(-1)
            if depth_vals.numel() > 0:
                cleaned_depths = self._remove_outliers(depth_vals)
                if cleaned_depths.numel() > 0:
                    fg_depths.append(cleaned_depths)
        return fg_depths

    def _get_pixelwise_foreground_max_depth(
        self,
        layered_world_depth: List[Dict],
        bg_shape: torch.Size,
        bg_device: torch.device
    ) -> torch.Tensor:
        r"""
        Calculate the maximum foreground depth for each pixel position
        Args:
            layered_world_depth: List of dictionaries containing depth information for each layer.
            bg_shape: Shape of the background depth tensor.
            bg_device: Device where the background depth tensor is located.
        Returns:
            Tensor of maximum foreground depth values for each pixel position
            (zero where no foreground layer covers the pixel).
        """
        fg_max_depth = torch.zeros(bg_shape, device=bg_device)
        for layer_depth in layered_world_depth:
            if layer_depth["name"] == "background":
                continue
            # Ensure that the tensors are on the correct device
            layer_distance = self._to_tensor(layer_depth["distance"], bg_device)
            layer_mask = layer_depth.get("mask", None)
            # Update the maximum depth of the foreground
            layer_max = torch.max(fg_max_depth, layer_distance)
            if layer_mask is not None:
                layer_mask = self._to_tensor(layer_mask, bg_device)
                fg_max_depth = torch.where(
                    layer_mask.bool(), layer_max, fg_max_depth)
            else:
                fg_max_depth = layer_max
        return fg_max_depth

    def _analyze_depth_distribution(self, bg_depth_distance: torch.Tensor) -> Dict:
        r"""
        Analyze the distribution characteristics of background depth
        Args:
            bg_depth_distance: Non-empty tensor of background depth distances.
        Returns:
            Dictionary containing statistical properties of the background depth distribution:
            "mean", "std", "cv" (std / mean) and the quantiles "q50", "q75", "q90", "q95", "q99".
        """
        bg_mean = torch.mean(bg_depth_distance)
        # The sample standard deviation of a single value is undefined (NaN);
        # treat it as "no spread" instead.
        if bg_depth_distance.numel() > 1:
            bg_std = torch.std(bg_depth_distance)
        else:
            bg_std = torch.zeros_like(bg_mean)
        # Zero spread means zero variation, even when the mean is zero (avoids 0/0 = NaN).
        cv = torch.zeros_like(bg_mean) if bg_std == 0 else bg_std / bg_mean
        bg_q50, bg_q75, bg_q90, bg_q95, bg_q99 = self._quantile(
            bg_depth_distance, [0.5, 0.75, 0.9, 0.95, 0.99])
        return {"mean": bg_mean, "std": bg_std, "cv": cv, "q50": bg_q50,
                "q75": bg_q75, "q90": bg_q90, "q95": bg_q95, "q99": bg_q99}

    def _determine_compression_strategy(self, cv: float) -> Tuple[str, float]:
        r"""
        Determine compression strategy based on coefficient of variation
        Args:
            cv: Coefficient of variation of the background depth distribution.
        Returns:
            Tuple containing the compression strategy ("gentle", "standard", "aggressive")
            and the quantile to use for compression.
        """
        low_cv_threshold, high_cv_threshold = self.cv_thresholds
        low_var_quantile, medium_var_quantile, high_var_quantile = self.compression_quantiles
        if cv < low_cv_threshold:
            return "gentle", low_var_quantile
        if cv > high_cv_threshold:
            return "aggressive", high_var_quantile
        return "standard", medium_var_quantile

    def _smooth_compression(self, depth_values: torch.Tensor, max_depth: torch.Tensor,
                            mask: Optional[torch.Tensor] = None, transition_start_ratio: float = 0.95,
                            transition_range_ratio: float = 0.2, verbose: bool = False) -> torch.Tensor:
        r"""
        Use smooth compression function instead of hard truncation
        Args:
            depth_values: Tensor of depth values to compress.
            max_depth: Maximum depth value for compression.
            mask: Optional boolean mask to apply compression only to certain pixels.
            transition_start_ratio: Ratio to determine the start of the transition range.
            transition_range_ratio: Ratio to determine the range of the transition.
            verbose: Whether to print detailed information about the compression process.
        Returns:
            Compressed depth values as a new tensor (the input is not modified).
        """
        out_dtype = depth_values.dtype
        if not self.enable_smooth_compression:
            # Hard truncation at max_depth. Results are cast back because max_depth
            # may have a wider dtype than the depth map.
            if mask is None:
                return torch.clamp(depth_values, max=max_depth).to(out_dtype)
            compressed = depth_values.clone()
            compressed[mask] = torch.clamp(
                depth_values[mask], max=max_depth).to(out_dtype)
            return compressed

        transition_start = max_depth * transition_start_ratio
        transition_range = max_depth * transition_range_ratio
        compressed_depth = depth_values.clone()
        mask_far = depth_values > transition_start
        if mask is not None:
            mask_far = mask_far & mask
        far_count = int(mask_far.sum())
        if far_count > 0:
            far_depths = depth_values[mask_far]
            normalized = (far_depths - transition_start) / transition_range
            # Sigmoid squashes the far depths into (transition_start, max_depth).
            # NOTE(review): at normalized == 0 this evaluates to about 0.63, so depths
            # just above transition_start jump upwards rather than continuing smoothly.
            compressed_normalized = torch.sigmoid(
                normalized * 2 - 1) * 0.5 + 0.5
            compressed_far = transition_start + \
                compressed_normalized * (max_depth - transition_start)
            # Masked assignment needs matching dtypes.
            compressed_depth[mask_far] = compressed_far.to(out_dtype)
            if verbose:
                print(
                    f"\t Applied smooth compression to {far_count} pixels beyond {transition_start:.2f}")
        elif verbose:
            print("\t No compression needed, all depths within reasonable range")
        return compressed_depth

    def compress_background_depth(self, bg_depth_distance: torch.Tensor, layered_world_depth: List[Dict],
                                  bg_mask: torch.Tensor, verbose: bool = False) -> torch.Tensor:
        r"""
        Adaptive compression of background depth values
        Args:
            bg_depth_distance: Tensor of background depth distances.
            layered_world_depth: List of dictionaries containing depth information for each layer.
            bg_mask: Tensor or numpy array representing the mask for background depth.
            verbose: Whether to print detailed information about the compression process.
        Returns:
            Compressed background depth values as a tensor. When compression is skipped
            (empty mask or max depth below ``min_compression_depth``) the input tensor
            itself is returned.
        """
        if verbose:
            print("\t - Applying adaptive depth compression...")
        # Process mask (numpy arrays are converted, tensors are moved to the depth device)
        mask_bool = self._to_tensor(bg_mask, bg_depth_distance.device).bool()
        masked_depths = bg_depth_distance[mask_bool]
        if masked_depths.numel() == 0:
            if verbose:
                print("\t No valid depths in mask region, skipping compression")
            return bg_depth_distance

        # 1. Collect foreground depth information
        fg_depths = self._collect_foreground_depths(layered_world_depth)

        # 2. Calculate foreground depth statistics
        fg_max = None
        if fg_depths:
            all_fg_depths = torch.cat(
                [d.to(bg_depth_distance.device) for d in fg_depths])
            fg_max = self._quantile(all_fg_depths, 0.99)
            if verbose:
                print(
                    f"\t Foreground depth stats - 99th percentile: {fg_max:.2f}")
        elif verbose:
            print("\t No foreground found, using background stats for reference")

        # 3. Analyze the depth distribution of the background
        depth_stats = self._analyze_depth_distribution(masked_depths)
        if verbose:
            print(
                f"\t Background depth stats - mean: {depth_stats['mean']:.2f}, "
                f"std: {depth_stats['std']:.2f}, CV: {depth_stats['cv']:.3f}")

        # 4. Determine compression strategy
        strategy, compression_quantile = self._determine_compression_strategy(
            depth_stats['cv'])
        max_depth = self._quantile(masked_depths, compression_quantile)
        if verbose:
            print(
                f"\t {strategy.capitalize()} compression strategy "
                f"(CV={depth_stats['cv']:.3f}), quantile={compression_quantile}")

        # 5. Pixel level depth constraint: the background must stay behind the
        #    foreground (times the margin) at every pixel.
        if fg_depths:
            fg_max_depth_pixelwise = self._get_pixelwise_foreground_max_depth(
                layered_world_depth, bg_depth_distance.shape, bg_depth_distance.device)
            required_min_bg_depth = fg_max_depth_pixelwise * self.fg_bg_depth_margin
            pixelwise_violations = (
                bg_depth_distance < required_min_bg_depth) & mask_bool
            if pixelwise_violations.any():
                violated_required_depths = required_min_bg_depth[pixelwise_violations]
                pixelwise_min_depth = self._quantile(
                    violated_required_depths, 0.99)
                max_depth = torch.max(max_depth, pixelwise_min_depth)
                if verbose:
                    violation_ratio = pixelwise_violations.float().sum() / mask_bool.float().sum()
                    print(
                        f"\t Pixelwise constraint violation: {violation_ratio:.1%}, "
                        f"adjusted max depth to {max_depth:.2f}")
            elif verbose:
                print("\t Pixelwise depth constraints satisfied")

        # 6. Global statistical constraints
        if fg_max is not None:
            min_bg_depth = fg_max * self.fg_bg_depth_margin
            max_depth = torch.max(max_depth, min_bg_depth)
            if verbose:
                print(f"\t Final max depth: {max_depth:.2f}")

        # 6.5. Depth threshold check: If max_depth is less than the threshold, skip compression
        if max_depth < self.min_compression_depth:
            if verbose:
                print(
                    f"\t Max depth {max_depth:.2f} is below threshold "
                    f"{self.min_compression_depth:.2f}, skipping compression")
            return bg_depth_distance

        # 7. Apply compression
        compressed_depth = self._smooth_compression(
            bg_depth_distance, max_depth, mask_bool, 0.9, 0.2, verbose)

        # 8. Hard truncation of extreme outliers (safety net; expected to be a
        #    no-op for a positive max_depth since step 7 already bounds the values)
        final_max = max_depth * 1.2
        outliers = (compressed_depth > final_max) & mask_bool
        if outliers.any():
            compressed_depth[outliers] = final_max

        # 9. Statistics
        if verbose:
            compression_ratio = ((bg_depth_distance > max_depth)
                                 & mask_bool).float().sum() / mask_bool.float().sum()
            print(
                f"\t Compression summary - max depth: "
                f"{max_depth:.2f}, affected: {compression_ratio:.1%}")
        return compressed_depth

    def compress_foreground_depth(
        self,
        fg_depth_distance: torch.Tensor,
        fg_mask: torch.Tensor,
        verbose: bool = False,
        conservative_ratio: float = 0.99,
        iqr_scale: float = 2
    ) -> torch.Tensor:
        r"""
        Conservatively compress outliers for foreground depth
        Args:
            fg_depth_distance: Tensor of foreground depth distances.
            fg_mask: Tensor or numpy array representing the mask for foreground depth.
            verbose: Whether to print detailed information about the compression process.
            conservative_ratio: Quantile used as the conservative upper bound.
            iqr_scale: Scale factor for IQR-based upper bound.
        Returns:
            Compressed foreground depth values as a tensor. When compression is skipped
            (empty mask or bound below ``min_compression_depth``) the input tensor
            itself is returned; otherwise a new tensor is returned.
        """
        if verbose:
            print("\t - Applying conservative foreground depth compression...")
        # Process mask (numpy arrays are converted, tensors are moved to the depth device)
        mask_bool = self._to_tensor(fg_mask, fg_depth_distance.device).bool()
        masked_depths = fg_depth_distance[mask_bool]
        if masked_depths.numel() == 0:
            if verbose:
                print("\t No valid depths in mask region, skipping compression")
            return fg_depth_distance

        # Determine the upper bound using IQR and quantile methods;
        # the larger (more permissive) of the two is used.
        q25, q75 = self._quantile(masked_depths, [0.25, 0.75])
        iqr = q75 - q25
        upper_bound = q75 + iqr_scale * iqr
        conservative_max = self._quantile(masked_depths, conservative_ratio)
        final_max = torch.max(upper_bound, conservative_max)

        # Statistical outliers
        outliers = (fg_depth_distance > final_max) & mask_bool
        outlier_count = outliers.sum().item()
        if verbose:
            # Mean/std are only reported, never used for the compression itself
            depth_mean, depth_std = torch.mean(
                masked_depths), torch.std(masked_depths)
            outlier_pct = outlier_count / masked_depths.numel() * 100
            print(
                f"\t Depth stats - mean: {depth_mean:.2f}, std: {depth_std:.2f}")
            print(
                f"\t IQR bounds - Q25: {q25:.2f}, Q75: {q75:.2f}, upper: {upper_bound:.2f}")
            print(
                f"\t Conservative max: {conservative_max:.2f}, final max: {final_max:.2f}")
            print(
                f"\t Outliers: {outlier_count} ({outlier_pct:.2f}%)")

        # Depth threshold check: If final_max is less than the threshold, skip compression
        if final_max < self.min_compression_depth:
            if verbose:
                print(
                    f"\t Final max depth {final_max:.2f} is below threshold "
                    f"{self.min_compression_depth:.2f}, skipping compression")
            return fg_depth_distance

        # Apply compression
        if outlier_count > 0:
            return self._smooth_compression(
                fg_depth_distance, final_max, mask_bool, 0.99, 0.1, verbose)
        return fg_depth_distance.clone()
def create_adaptive_depth_compressor(
    scene_type: str = "auto",
    enable_smooth_compression: bool = True,
    outlier_removal_method: str = "iqr",
    min_compression_depth: float = 6.0,
) -> AdaptiveDepthCompressor:
    r"""
    Build an AdaptiveDepthCompressor preconfigured for a scene type
    Args:
        scene_type: Scenario type ("indoor", "outdoor", "mixed", "auto"). Any value
            other than "indoor", "outdoor" or "mixed" is handled like "auto".
        enable_smooth_compression: Enable smooth compression or not.
        outlier_removal_method: Outlier removal method ("iqr", "quantile", "none").
        min_compression_depth: Minimum compression depth threshold.
    Returns:
        The configured AdaptiveDepthCompressor instance.
    """
    if scene_type == "indoor":
        # Indoor: depth variation is relatively small, so compress conservatively
        preset = {
            "cv_thresholds": (0.2, 0.6),
            "compression_quantiles": (1.0, 0.975, 0.95),
            "fg_bg_depth_margin": 1.05,
        }
    elif scene_type == "outdoor":
        # Outdoor: sky, distant mountains, etc. call for more aggressive compression
        preset = {
            "cv_thresholds": (0.4, 1.0),
            "compression_quantiles": (0.98, 0.955, 0.93),
            "fg_bg_depth_margin": 1.15,
        }
    elif scene_type == "mixed":
        # Mixed: balanced settings
        preset = {
            "cv_thresholds": (0.3, 0.8),
            "compression_quantiles": (0.99, 0.97, 0.95),
            "fg_bg_depth_margin": 1.1,
        }
    else:
        # Automatic mode: rely on the class defaults
        preset = {}

    return AdaptiveDepthCompressor(
        enable_smooth_compression=enable_smooth_compression,
        outlier_removal_method=outlier_removal_method,
        min_compression_depth=min_compression_depth,
        **preset,
    )