# Tencent HunyuanWorld-1.0 is licensed under TENCENT HUNYUANWORLD-1.0 COMMUNITY LICENSE AGREEMENT
# THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND
# IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
# By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying
# any portion or element of the Tencent HunyuanWorld-1.0 Works, including via any Hosted Service,
# You will be deemed to have recognized and accepted the content of this Agreement,
# which is effective immediately.

# For avoidance of doubts, Tencent HunyuanWorld-1.0 means the 3D generation models
# and their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent at [https://github.com/Tencent-Hunyuan/HunyuanWorld-1.0].

import torch
from typing import Dict, List, Optional, Sequence, Tuple, Union


class AdaptiveDepthCompressor:
    r"""
    Adaptive depth compressor to solve the problem of excessive background depth variance
    in 3D world generation.

    This class provides methods to compress background and foreground depth values based on
    statistical analysis of depth distributions, with options for smooth compression
    and outlier removal.

    Args:
        cv_thresholds: Tuple of (low, high) thresholds for coefficient of variation (CV).
        compression_quantiles: Tuple of (low, medium, high) quantiles for depth compression.
            The first entry is used when CV is below the low threshold, the last when CV is
            above the high threshold, the middle one otherwise.
        fg_bg_depth_margin: Margin factor to ensure foreground depth is greater than background.
        enable_smooth_compression: Whether to use smooth compression instead of hard truncation.
        outlier_removal_method: Method for outlier removal, options are "iqr", "quantile", or "none".
            Any value other than "iqr" or "quantile" disables outlier removal.
        min_compression_depth: Minimum depth threshold for compression to be applied.
    """

    def __init__(
        self,
        cv_thresholds: Tuple[float, float] = (0.3, 0.8),
        compression_quantiles: Tuple[float, float, float] = (0.95, 0.92, 0.85),
        fg_bg_depth_margin: float = 1.1,
        enable_smooth_compression: bool = True,
        outlier_removal_method: str = "iqr",
        min_compression_depth: float = 6.0,
    ):
        self.cv_thresholds = cv_thresholds
        self.compression_quantiles = compression_quantiles
        self.fg_bg_depth_margin = fg_bg_depth_margin
        self.enable_smooth_compression = enable_smooth_compression
        self.outlier_removal_method = outlier_removal_method
        self.min_compression_depth = min_compression_depth

    @staticmethod
    def _quantile(
        values: torch.Tensor, q: Union[float, Sequence[float]]
    ) -> torch.Tensor:
        r"""
        Compute one or several quantiles of ``values`` (flattened).

        ``torch.quantile`` requires a tensor ``q`` to have the same dtype and device as the
        input and only accepts float32/float64 inputs, so the probabilities are built
        to match ``values`` here instead of relying on the default float32 dtype.

        Args:
            values: Tensor of values; other dtypes than float32/float64 are cast to float32.
            q: A single probability or a sequence of probabilities in [0, 1].
        Returns:
            A 0-dim tensor for a scalar ``q``, otherwise a 1-D tensor with one entry per probability.
        """
        if values.dtype not in (torch.float32, torch.float64):
            values = values.to(torch.float32)
        if isinstance(q, (int, float)):
            return torch.quantile(values, float(q))
        q_tensor = torch.tensor(list(q), dtype=values.dtype, device=values.device)
        return torch.quantile(values, q_tensor)

    def _remove_outliers(self, depth_vals: torch.Tensor) -> torch.Tensor:
        r"""
        Remove outliers from depth values based on the specified method (IQR or quantile).

        Args:
            depth_vals: Tensor of depth values to process.
        Returns:
            Tensor of depth values with outliers removed. The input is returned unchanged when
            outlier removal is disabled or when no value would survive the filter.
        """
        if self.outlier_removal_method == "iqr":
            q25, q75 = self._quantile(depth_vals, (0.25, 0.75))
            iqr = q75 - q25
            lower_bound, upper_bound = q25 - 1.5 * iqr, q75 + 1.5 * iqr
        elif self.outlier_removal_method == "quantile":
            lower_bound, upper_bound = self._quantile(depth_vals, (0.05, 0.95))
        else:
            return depth_vals

        valid_mask = (depth_vals >= lower_bound) & (depth_vals <= upper_bound)
        return depth_vals[valid_mask] if valid_mask.any() else depth_vals

    def _collect_foreground_depths(
        self, layered_world_depth: List[Dict]
    ) -> List[torch.Tensor]:
        r"""
        Collect depth information of all foreground layers (remove outliers)
        from the layered world depth representation.

        Args:
            layered_world_depth: List of dictionaries containing depth information for each layer.
                Each dictionary is read through its "name" and "distance" keys and an
                optional "mask" key; layers named "background" are skipped.
        Returns:
            List of 1-D tensors containing cleaned foreground depth values.
        """
        fg_depths = []
        for layer_depth in layered_world_depth:
            if layer_depth["name"] == "background":
                continue

            depth_vals = layer_depth["distance"]
            # Accept numpy arrays as well, like _get_pixelwise_foreground_max_depth does
            if not isinstance(depth_vals, torch.Tensor):
                depth_vals = torch.from_numpy(depth_vals)

            mask = layer_depth.get("mask", None)
            if mask is not None:
                # Process the depth values within the mask area
                if not isinstance(mask, torch.Tensor):
                    mask = torch.from_numpy(mask)
                depth_vals = depth_vals[mask.to(depth_vals.device).bool()]
            else:
                # Keep every collected entry 1-D so that they can be concatenated later
                depth_vals = depth_vals.reshape(-1)

            if depth_vals.numel() == 0:
                continue
            cleaned_depths = self._remove_outliers(depth_vals)
            if cleaned_depths.numel() > 0:
                fg_depths.append(cleaned_depths)
        return fg_depths

    def _get_pixelwise_foreground_max_depth(
        self,
        layered_world_depth: List[Dict],
        bg_shape: torch.Size,
        bg_device: torch.device
    ) -> torch.Tensor:
        r"""
        Calculate the maximum foreground depth for each pixel position

        Args:
            layered_world_depth: List of dictionaries containing depth information for each layer.
            bg_shape: Shape of the background depth tensor.
            bg_device: Device where the background depth tensor is located.
        Returns:
            Tensor of maximum foreground depth values for each pixel position.
            Pixels not covered by any foreground layer mask keep the initial value 0.
        """
        fg_max_depth = torch.zeros(bg_shape, device=bg_device)

        for layer_depth in layered_world_depth:
            if layer_depth["name"] == "background":
                continue

            # Ensure that the tensor is on the correct device
            layer_distance = layer_depth["distance"]
            if not isinstance(layer_distance, torch.Tensor):
                layer_distance = torch.from_numpy(layer_distance)
            layer_distance = layer_distance.to(bg_device)

            # Update the maximum depth of the foreground
            candidate = torch.max(fg_max_depth, layer_distance)
            layer_mask = layer_depth.get("mask", None)
            if layer_mask is None:
                fg_max_depth = candidate
                continue

            if not isinstance(layer_mask, torch.Tensor):
                layer_mask = torch.from_numpy(layer_mask)
            layer_mask = layer_mask.to(bg_device)
            fg_max_depth = torch.where(layer_mask.bool(), candidate, fg_max_depth)

        return fg_max_depth

    def _analyze_depth_distribution(self, bg_depth_distance: torch.Tensor) -> Dict:
        r"""
        Analyze the distribution characteristics of background depth

        Args:
            bg_depth_distance: Tensor of background depth distances.
        Returns:
            Dictionary containing statistical properties of the background depth distribution:
            "mean", "std", "cv" (std / mean) and the quantiles "q50", "q75", "q90", "q95", "q99".
        """
        bg_mean = torch.mean(bg_depth_distance)
        bg_std = torch.std(bg_depth_distance)
        cv = bg_std / bg_mean
        bg_q50, bg_q75, bg_q90, bg_q95, bg_q99 = self._quantile(
            bg_depth_distance, (0.5, 0.75, 0.9, 0.95, 0.99))
        return {
            "mean": bg_mean, "std": bg_std, "cv": cv,
            "q50": bg_q50, "q75": bg_q75, "q90": bg_q90,
            "q95": bg_q95, "q99": bg_q99,
        }

    def _determine_compression_strategy(self, cv: float) -> Tuple[str, float]:
        r"""
        Determine compression strategy based on coefficient of variation

        Args:
            cv: Coefficient of variation of the background depth distribution.
        Returns:
            Tuple containing the compression strategy ("gentle", "standard", "aggressive")
            and the quantile to use for compression.
        """
        low_cv_threshold, high_cv_threshold = self.cv_thresholds
        low_var_quantile, medium_var_quantile, high_var_quantile = self.compression_quantiles

        if cv < low_cv_threshold:
            return "gentle", low_var_quantile
        if cv > high_cv_threshold:
            return "aggressive", high_var_quantile
        return "standard", medium_var_quantile

    def _smooth_compression(
        self,
        depth_values: torch.Tensor,
        max_depth: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        transition_start_ratio: float = 0.95,
        transition_range_ratio: float = 0.2,
        verbose: bool = False
    ) -> torch.Tensor:
        r"""
        Use smooth compression function instead of hard truncation

        When ``enable_smooth_compression`` is False the depths are simply clamped to
        ``max_depth``. Otherwise depths above ``max_depth * transition_start_ratio`` are
        remapped through a sigmoid so that they stay below ``max_depth``; depths at or below
        the transition start are left untouched.

        Args:
            depth_values: Tensor of depth values to compress.
            max_depth: Maximum depth value for compression.
            mask: Optional boolean mask to apply compression only to certain pixels.
            transition_start_ratio: Ratio to determine the start of the transition range.
            transition_range_ratio: Ratio to determine the range of the transition.
            verbose: Whether to print detailed information about the compression process.
        Returns:
            Compressed depth values as a new tensor (the input is not modified).
        """
        if not self.enable_smooth_compression:
            if mask is None:
                return torch.clamp(depth_values, max=max_depth)
            compressed = depth_values.clone()
            compressed[mask] = torch.clamp(depth_values[mask], max=max_depth)
            return compressed

        transition_start = max_depth * transition_start_ratio
        transition_range = max_depth * transition_range_ratio
        compressed_depth = depth_values.clone()

        mask_far = depth_values > transition_start
        if mask is not None:
            mask_far = mask_far & mask

        far_count = int(mask_far.sum())
        if far_count > 0:
            far_depths = depth_values[mask_far]
            normalized = (far_depths - transition_start) / transition_range
            # The sigmoid term lies in (0, 1), so the scaled factor lies in (0.5, 1) and the
            # result always stays between transition_start and max_depth.
            compressed_normalized = torch.sigmoid(normalized * 2 - 1) * 0.5 + 0.5
            compressed_depth[mask_far] = transition_start + \
                compressed_normalized * (max_depth - transition_start)
            if verbose:
                print(
                    f"\t Applied smooth compression to {far_count} pixels beyond {transition_start:.2f}")
        elif verbose:
            print("\t No compression needed, all depths within reasonable range")

        return compressed_depth

    def compress_background_depth(
        self,
        bg_depth_distance: torch.Tensor,
        layered_world_depth: List[Dict],
        bg_mask: torch.Tensor,
        verbose: bool = False
    ) -> torch.Tensor:
        r"""
        Adaptive compression of background depth values

        Args:
            bg_depth_distance: Tensor of background depth distances.
            layered_world_depth: List of dictionaries containing depth information for each layer.
            bg_mask: Tensor or numpy array representing the mask for background depth.
            verbose: Whether to print detailed information about the compression process.
        Returns:
            Compressed background depth values as a tensor. When compression is skipped (empty
            mask or maximum depth below ``min_compression_depth``) the input tensor itself
            is returned.
        """
        if verbose:
            print("\t - Applying adaptive depth compression...")

        # Process mask
        if not isinstance(bg_mask, torch.Tensor):
            bg_mask = torch.from_numpy(bg_mask)
        mask_bool = bg_mask.to(bg_depth_distance.device).bool()
        masked_depths = bg_depth_distance[mask_bool]

        if masked_depths.numel() == 0:
            if verbose:
                print("\t No valid depths in mask region, skipping compression")
            return bg_depth_distance

        # 1. Collect foreground depth information
        fg_depths = self._collect_foreground_depths(layered_world_depth)

        # 2. Calculate foreground depth statistics
        if fg_depths:
            all_fg_depths = torch.cat(fg_depths)
            fg_max = self._quantile(all_fg_depths, 0.99)
            if verbose:
                print(
                    f"\t Foreground depth stats - 99th percentile: {fg_max:.2f}")
        else:
            fg_max = self._quantile(masked_depths, 0.5)
            if verbose:
                print("\t No foreground found, using background stats for reference")

        # 3. Analyze the depth distribution of the background
        depth_stats = self._analyze_depth_distribution(masked_depths)
        if verbose:
            print(
                f"\t Background depth stats - mean: {depth_stats['mean']:.2f}, "
                f"std: {depth_stats['std']:.2f}, CV: {depth_stats['cv']:.3f}")

        # 4. Determine compression strategy
        strategy, compression_quantile = self._determine_compression_strategy(
            depth_stats['cv'])
        max_depth = self._quantile(masked_depths, compression_quantile)
        if verbose:
            print(
                f"\t {strategy.capitalize()} compression strategy "
                f"(CV={depth_stats['cv']:.3f}), quantile={compression_quantile}")

        # 5. Pixel level depth constraint
        if fg_depths:
            fg_max_depth_pixelwise = self._get_pixelwise_foreground_max_depth(
                layered_world_depth, bg_depth_distance.shape, bg_depth_distance.device)
            required_min_bg_depth = fg_max_depth_pixelwise * self.fg_bg_depth_margin
            pixelwise_violations = (
                bg_depth_distance < required_min_bg_depth) & mask_bool

            if pixelwise_violations.any():
                violation_ratio = pixelwise_violations.float().sum() / mask_bool.float().sum()
                violated_required_depths = required_min_bg_depth[pixelwise_violations]
                pixelwise_min_depth = self._quantile(violated_required_depths, 0.99)
                # Never compress below the depth required to stay behind the foreground
                max_depth = torch.max(
                    max_depth, pixelwise_min_depth.to(max_depth.dtype))
                if verbose:
                    print(
                        f"\t Pixelwise constraint violation: {violation_ratio:.1%}, "
                        f"adjusted max depth to {max_depth:.2f}")
            elif verbose:
                print("\t Pixelwise depth constraints satisfied")

        # 6. Global statistical constraints
        if fg_depths:
            min_bg_depth = fg_max * self.fg_bg_depth_margin
            max_depth = torch.max(max_depth, min_bg_depth.to(max_depth.dtype))
            if verbose:
                print(f"\t Final max depth: {max_depth:.2f}")

        # 6.5. Depth threshold check: If max_depth is less than the threshold, skip compression
        if max_depth < self.min_compression_depth:
            if verbose:
                print(
                    f"\t Max depth {max_depth:.2f} is below threshold "
                    f"{self.min_compression_depth:.2f}, skipping compression")
            return bg_depth_distance

        # 7. Apply compression
        compressed_depth = self._smooth_compression(
            bg_depth_distance, max_depth, mask_bool, 0.9, 0.2, verbose)

        # 8. Hard truncation of extreme outliers
        final_max = max_depth * 1.2
        outliers = (compressed_depth > final_max) & mask_bool
        if outliers.any():
            compressed_depth[outliers] = final_max

        # 9. Statistics
        if verbose:
            compression_ratio = ((bg_depth_distance > max_depth)
                                 & mask_bool).float().sum() / mask_bool.float().sum()
            print(
                f"\t Compression summary - max depth: "
                f"{max_depth:.2f}, affected: {compression_ratio:.1%}")

        return compressed_depth

    def compress_foreground_depth(
        self,
        fg_depth_distance: torch.Tensor,
        fg_mask: torch.Tensor,
        verbose: bool = False,
        conservative_ratio: float = 0.99,
        iqr_scale: float = 2
    ) -> torch.Tensor:
        r"""
        Conservatively compress outliers for foreground depth

        Args:
            fg_depth_distance: Tensor of foreground depth distances.
            fg_mask: Tensor or numpy array representing the mask for foreground depth.
            verbose: Whether to print detailed information about the compression process.
            conservative_ratio: Ratio to use for conservative compression.
            iqr_scale: Scale factor for IQR-based upper bound.
        Returns:
            Compressed foreground depth values as a tensor. When compression is skipped (empty
            mask or maximum below ``min_compression_depth``) the input tensor itself is returned.
        """
        if verbose:
            print("\t - Applying conservative foreground depth compression...")

        # Process mask
        if not isinstance(fg_mask, torch.Tensor):
            fg_mask = torch.from_numpy(fg_mask)
        mask_bool = fg_mask.to(fg_depth_distance.device).bool()
        masked_depths = fg_depth_distance[mask_bool]

        if masked_depths.numel() == 0:
            if verbose:
                print("\t No valid depths in mask region, skipping compression")
            return fg_depth_distance

        # Determine the upper bound using IQR and quantile methods;
        # the larger of the two bounds is used so that compression stays conservative
        q25, q75 = self._quantile(masked_depths, (0.25, 0.75))
        iqr = q75 - q25
        upper_bound = q75 + iqr_scale * iqr
        conservative_max = self._quantile(masked_depths, conservative_ratio)
        final_max = torch.max(upper_bound, conservative_max)

        # Statistical outliers
        outliers = (fg_depth_distance > final_max) & mask_bool
        outlier_count = outliers.sum().item()

        if verbose:
            # Calculate statistical information (only needed for reporting)
            depth_mean = torch.mean(masked_depths)
            depth_std = torch.std(masked_depths)
            print(
                f"\t Depth stats - mean: {depth_mean:.2f}, std: {depth_std:.2f}")
            print(
                f"\t IQR bounds - Q25: {q25:.2f}, Q75: {q75:.2f}, upper: {upper_bound:.2f}")
            print(
                f"\t Conservative max: {conservative_max:.2f}, final max: {final_max:.2f}")
            print(
                f"\t Outliers: {outlier_count} ({(outlier_count/masked_depths.numel()*100):.2f}%)")

        # Depth threshold check: If final_max is less than the threshold, skip compression
        if final_max < self.min_compression_depth:
            if verbose:
                print(
                    f"\t Final max depth {final_max:.2f} is below threshold "
                    f"{self.min_compression_depth:.2f}, skipping compression")
            return fg_depth_distance

        # Apply compression
        if outlier_count > 0:
            return self._smooth_compression(
                fg_depth_distance, final_max, mask_bool, 0.99, 0.1, verbose)
        return fg_depth_distance.clone()


def create_adaptive_depth_compressor(
    scene_type: str = "auto",
    enable_smooth_compression: bool = True,
    outlier_removal_method: str = "iqr",
    min_compression_depth: float = 6.0,  # Minimum compression depth threshold
) -> AdaptiveDepthCompressor:
    r"""
    Create adaptive depth compressors suitable for different scene types

    Args:
        scene_type: Scenario Type ("indoor", "outdoor", "mixed", "auto").
            Any other value is treated like "auto" and uses the class defaults.
        enable_smooth_compression: enable smooth compression or not
        outlier_removal_method: Outlier removal method ("iqr", "quantile", "none")
        min_compression_depth: Minimum depth threshold below which compression is skipped
    Returns:
        A configured AdaptiveDepthCompressor instance.
    """
    common_params = {
        "enable_smooth_compression": enable_smooth_compression,
        "outlier_removal_method": outlier_removal_method,
        "min_compression_depth": min_compression_depth,
    }

    scene_presets = {
        # Indoor scene: Depth variation is relatively small, conservative compression is used
        "indoor": {
            "cv_thresholds": (0.2, 0.6),
            "compression_quantiles": (1.0, 0.975, 0.95),
            "fg_bg_depth_margin": 1.05,
        },
        # Outdoor scenes: There may be sky, distant mountains, etc.,
        # using more aggressive compression
        "outdoor": {
            "cv_thresholds": (0.4, 1.0),
            "compression_quantiles": (0.98, 0.955, 0.93),
            "fg_bg_depth_margin": 1.15,
        },
        # Mixed Scene: Balanced Settings
        "mixed": {
            "cv_thresholds": (0.3, 0.8),
            "compression_quantiles": (0.99, 0.97, 0.95),
            "fg_bg_depth_margin": 1.1,
        },
    }

    # Automatic mode (or unknown scene type): Use default settings
    preset = scene_presets.get(scene_type, {})
    return AdaptiveDepthCompressor(**preset, **common_params)