# %%writefile mv_utils_zs.py
"""
Author: yangyangyang127
Github: https://github.com/yangyangyang127
Repo: https://github.com/yangyangyang127/PointCLIP_V2
Path: https://github.com/yangyangyang127/PointCLIP_V2/blob/main/zeroshot_cls/trainers/mv_utils_zs.py#L135
"""

import numpy as np
import torch
import torch.nn as nn
from torch_scatter import scatter

TRANS = -1.5

# realistic projection parameters
params = {
    "maxpoolz": 1,
    "maxpoolxy": 7,
    "maxpoolpadz": 0,
    "maxpoolpadxy": 2,
    "convz": 1,
    "convxy": 3,
    "convsigmaxy": 3,
    "convsigmaz": 1,
    "convpadz": 0,
    "convpadxy": 1,
    "imgbias": 0.0,
    "depth_bias": 0.2,
    "obj_ratio": 0.8,
    "bg_clr": 0.0,
    "resolution": 122,
    "depth": 8,  # default = 8
    "grid_height": 64,
    "grid_width": 64,
}


class Grid2Image(nn.Module):
    """A PyTorch module that turns a 3D grid into a 2D image.

    Maxpool: densifies the grid
    Convolution: smooths via a Gaussian kernel
    Maximize: squeezes the depth channel
    """

    def __init__(self):
        super().__init__()
        torch.backends.cudnn.benchmark = False

        self.maxpool = nn.MaxPool3d(
            (params["maxpoolz"], params["maxpoolxy"], params["maxpoolxy"]),
            stride=1,
            padding=(
                params["maxpoolpadz"],
                params["maxpoolpadxy"],
                params["maxpoolpadxy"],
            ),
        )
        self.conv = torch.nn.Conv3d(
            1,
            1,
            kernel_size=(params["convz"], params["convxy"], params["convxy"]),
            stride=1,
            padding=(params["convpadz"], params["convpadxy"], params["convpadxy"]),
            bias=True,
        )
        kn3d = get3DGaussianKernel(
            params["convxy"],
            params["convz"],
            sigma=params["convsigmaxy"],
            zsigma=params["convsigmaz"],
        )
        self.conv.weight.data = torch.Tensor(kn3d).repeat(1, 1, 1, 1, 1)
        self.conv.bias.data.fill_(0)  # type: ignore

    def forward(self, x):
        x = self.maxpool(x.unsqueeze(1))
        x = self.conv(x)
        img = torch.max(x, dim=2)[0]
        img = img / torch.max(torch.max(img, dim=-1)[0], dim=-1)[0][:, :, None, None]
        img = 1 - img
        img = img.repeat(1, 3, 1, 1)
        return img


def euler2mat(angle):
    """Convert Euler angles to a rotation matrix.

    :param angle: [3] or [b, 3]
    :return rotmat: [3, 3] or [b, 3, 3]

    source:
    https://github.com/ClementPinard/SfmLearner-Pytorch/blob/master/inverse_warp.py
    """
    if len(angle.size()) == 1:
        x, y, z = angle[0], angle[1], angle[2]
        _dim = 0
        _view = [3, 3]
    elif len(angle.size()) == 2:
        b, _ = angle.size()
        x, y, z = angle[:, 0], angle[:, 1], angle[:, 2]
        _dim = 1
        _view = [b, 3, 3]
    else:
        assert False

    cosz = torch.cos(z)
    sinz = torch.sin(z)

    # zero/one are derived from z so they inherit its device and batch shape
    zero = z.detach() * 0
    one = zero.detach() + 1
    zmat = torch.stack(
        [cosz, -sinz, zero, sinz, cosz, zero, zero, zero, one], dim=_dim
    ).reshape(_view)

    cosy = torch.cos(y)
    siny = torch.sin(y)

    ymat = torch.stack(
        [cosy, zero, siny, zero, one, zero, -siny, zero, cosy], dim=_dim
    ).reshape(_view)

    cosx = torch.cos(x)
    sinx = torch.sin(x)

    xmat = torch.stack(
        [one, zero, zero, zero, cosx, -sinx, zero, sinx, cosx], dim=_dim
    ).reshape(_view)

    rot_mat = xmat @ ymat @ zmat
    return rot_mat
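
# A minimal sanity check for euler2mat (our addition, not part of the
# original file): a proper rotation matrix is orthogonal with determinant
# +1, and a rotation of pi/2 about z maps the x-axis onto the y-axis. The
# helper name `_check_euler2mat` is ours, for illustration only.
def _check_euler2mat():
    angle = torch.tensor([0.0, 0.0, np.pi / 2])
    rot = euler2mat(angle)  # [3, 3]
    # orthogonality: R @ R^T == I
    assert torch.allclose(rot @ rot.T, torch.eye(3), atol=1e-6)
    # determinant +1 (proper rotation, no reflection)
    assert torch.allclose(torch.det(rot), torch.tensor(1.0), atol=1e-6)
    # the x-axis maps to the y-axis under a +90 degree rotation about z
    assert torch.allclose(
        rot @ torch.tensor([1.0, 0.0, 0.0]), torch.tensor([0.0, 1.0, 0.0]), atol=1e-6
    )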

def points_to_2d_grid(
    points, grid_h=params["grid_height"], grid_w=params["grid_width"]
):
    """Convert a point cloud into a 2D grid based on its X, Y coordinates.

    Points are projected onto a plane and quantized into grid cells.

    Args:
        points (torch.tensor): Tensor containing points, shape [B, P, 3]
            (B: batch size, P: number of points, 3: x, y, z coordinates)
        grid_h (int): Height of the output 2D grid.
        grid_w (int): Width of the output 2D grid.

    Returns:
        grid (torch.tensor): 2D grid representing the occupancy of points,
            shape [B, grid_h, grid_w]. A cell (y, x) holds 1.0 if at least
            one point falls into it, otherwise the background value
            (params["bg_clr"]).
    """
    batch, pnum, _ = points.shape
    device = points.device

    # --- Step 1: Normalize point coordinates ---
    # Find min/max for each point cloud in the batch (only X, Y matter for
    # the 2D normalization)
    pmax_xy = points[:, :, :2].max(dim=1)[0]
    pmin_xy = points[:, :, :2].min(dim=1)[0]

    # Compute the center and range based on X, Y
    pcent_xy = (pmax_xy + pmin_xy) / 2
    pcent_xy = pcent_xy[:, None, :]  # add P dimension for broadcasting [B, 1, 2]

    # Use the larger of the X and Y ranges to preserve the aspect ratio
    prange_xy = (pmax_xy - pmin_xy).max(dim=-1)[0][:, None, None]  # [B, 1, 1]

    # A small epsilon avoids division by zero when all points coincide
    epsilon = 1e-8

    # Normalize X, Y into the range [-1, 1]
    points_normalized_xy = (points[:, :, :2] - pcent_xy) / (prange_xy + epsilon) * 2.0

    # Shrink the object inside the grid according to obj_ratio
    points_normalized_xy = points_normalized_xy * params["obj_ratio"]

    # --- Step 2: Map normalized coordinates to 2D grid indices ---
    # Map X from [-obj_ratio, obj_ratio] -> [0, grid_w]
    # Map Y from [-obj_ratio, obj_ratio] -> [0, grid_h]
    # General formula: (normalized_coord + scale) / (2 * scale) * grid_dim
    _x = (
        (points_normalized_xy[:, :, 0] + params["obj_ratio"])
        / (2 * params["obj_ratio"])
        * grid_w
    )
    _y = (
        (points_normalized_xy[:, :, 1] + params["obj_ratio"])
        / (2 * params["obj_ratio"])
        * grid_h
    )

    # Round down to determine the grid cell indices
    _x = torch.floor(_x).long()
    _y = torch.floor(_y).long()

    # --- Step 3: Clamp indices to the valid grid range ---
    _x = torch.clip(_x, 0, grid_w - 1)
    _y = torch.clip(_y, 0, grid_h - 1)

    # --- Step 4: Create a 2D grid and mark occupied cells ---
    # Initialize the 2D grid with the background value
    grid = torch.full(
        (batch, grid_h, grid_w), params["bg_clr"], dtype=torch.float32, device=device
    )

    # Batch index for each point, flattened for advanced indexing
    batch_indices = torch.arange(batch, device=device).view(-1, 1).repeat(1, pnum)
    batch_idx_flat = batch_indices.view(-1)
    y_idx_flat = _y.view(-1)
    x_idx_flat = _x.view(-1)

    # Mark each occupied cell (y, x) with 1.0; if several points fall into
    # the same cell, it still holds just 1.0
    grid[batch_idx_flat, y_idx_flat, x_idx_flat] = 1.0

    return grid
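
# A short usage sketch for points_to_2d_grid (our addition, not part of the
# original file): project a random batch of point clouds and confirm the
# output is a binary-valued [B, grid_h, grid_w] occupancy grid.
def _demo_points_to_2d_grid():
    points = torch.rand(2, 1024, 3)  # [B=2, P=1024, xyz]
    grid = points_to_2d_grid(points)
    print(grid.shape)  # torch.Size([2, 64, 64]) with the default params
    # every cell is either bg_clr (0.0) or 1.0
    print(torch.unique(grid))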

def points2grid(points, resolution=params["resolution"], depth=params["depth"]):
    """Quantize each point cloud to a 3D grid.

    Args:
        points (torch.tensor): of size [B, _, 3]
    Returns:
        grid (torch.tensor): of size
            [B * self.num_views, depth, resolution, resolution]
    """
    batch, pnum, _ = points.shape

    pmax, pmin = points.max(dim=1)[0], points.min(dim=1)[0]
    pcent = (pmax + pmin) / 2
    pcent = pcent[:, None, :]
    prange = (pmax - pmin).max(dim=-1)[0][:, None, None]
    points = (points - pcent) / prange * 2.0
    points[:, :, :2] = points[:, :, :2] * params["obj_ratio"]

    depth_bias = params["depth_bias"]
    _x = (points[:, :, 0] + 1) / 2 * resolution
    _y = (points[:, :, 1] + 1) / 2 * resolution
    _z = ((points[:, :, 2] + 1) / 2 + depth_bias) / (1 + depth_bias) * (depth - 2)

    _x.ceil_()
    _y.ceil_()
    z_int = _z.ceil()

    _x = torch.clip(_x, 1, resolution - 2)
    _y = torch.clip(_y, 1, resolution - 2)
    _z = torch.clip(_z, 1, depth - 2)

    coordinates = z_int * resolution * resolution + _y * resolution + _x
    grid = (
        torch.ones([batch, depth, resolution, resolution], device=points.device).view(
            batch, -1
        )
        * params["bg_clr"]
    )

    grid = scatter(_z, coordinates.long(), dim=1, out=grid, reduce="max")
    grid = grid.reshape((batch, depth, resolution, resolution)).permute((0, 1, 3, 2))

    return grid


def points_to_occupancy_grid(
    points, resolution=params["resolution"], depth=params["depth"]
):
    """Quantize each point cloud into a binary 3D occupancy grid."""
    batch, pnum, _ = points.shape
    device = points.device  # device on which new tensors are created

    # --- Normalization and coordinate mapping, as in points2grid ---
    pmax, pmin = points.max(dim=1)[0], points.min(dim=1)[0]
    pcent = (pmax + pmin) / 2
    pcent = pcent[:, None, :]
    prange = (pmax - pmin).max(dim=-1)[0][
        :, None, None
    ] + 1e-8  # epsilon avoids division by zero
    points_norm = (points - pcent) / prange * 2.0
    points_norm[:, :, :2] = points_norm[:, :, :2] * params["obj_ratio"]

    depth_bias = params["depth_bias"]
    _x = (points_norm[:, :, 0] + 1) / 2 * resolution
    _y = (points_norm[:, :, 1] + 1) / 2 * resolution
    _z = ((points_norm[:, :, 2] + 1) / 2 + depth_bias) / (1 + depth_bias) * (depth - 2)

    _x.ceil_()
    _y.ceil_()
    z_int = _z.ceil()

    _x = torch.clip(_x, 1, resolution - 2)
    _y = torch.clip(_y, 1, resolution - 2)
    # z_int must also be clipped since it is used as a coordinate index
    z_int = torch.clip(z_int, 1, depth - 2)

    # --- Compute flattened coordinates within each [depth, res, res] volume ---
    coordinates = (z_int * resolution * resolution + _y * resolution + _x).long()

    # --- Create the grid and scatter occupancy into it ---
    # Initialize the grid with the background value
    bg_clr_value = params.get("bg_clr", 0.0)  # default is 0
    grid = torch.full(
        (batch, depth * resolution * resolution),
        bg_clr_value,
        dtype=torch.float32,
        device=device,
    )

    # Source tensor: a value of 1.0 per point, shaped like `coordinates` so
    # the scatter runs along dim=1 with one row per batch element. (Scattering
    # into a fully flattened [B*D*R*R] grid along dim=0 would require adding
    # per-batch offsets to `coordinates`, since they only index within a
    # single volume; scattering along dim=1 keeps the batch dimension intact.)
    values_to_scatter = torch.ones(batch, pnum, dtype=torch.float32, device=device)

    # reduce="max": a cell with at least one point becomes max(1.0, bg_clr),
    # i.e. exactly 1.0 whenever bg_clr <= 1; empty cells keep bg_clr. For a
    # strictly binary 0/1 grid, bg_clr must be 0.
    if bg_clr_value != 0.0:
        print(
            "Warning: bg_clr is not 0.0, so the occupancy grid will not be "
            "strictly binary 0/1 with reduce='max'. Consider initializing "
            "the grid with 0."
        )

    grid = scatter(values_to_scatter, coordinates, dim=1, out=grid, reduce="max")

    # --- Reshape back to [B, depth, resolution, resolution] and permute ---
    grid = grid.view(batch, depth, resolution, resolution)
    grid = grid.permute((0, 1, 3, 2))

    return grid
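
# A small usage sketch (our addition, not part of the original file):
# points2grid stores the max normalized depth value per cell, while
# points_to_occupancy_grid stores a plain 0/1 flag. With the default params
# both return [B, 8, 122, 122] grids.
def _demo_3d_grids():
    points = torch.rand(2, 1024, 3)
    depth_grid = points2grid(points)
    occ_grid = points_to_occupancy_grid(points)
    print(depth_grid.shape, occ_grid.shape)  # [2, 8, 122, 122] each
    print(torch.unique(occ_grid))  # tensor([0., 1.]) with bg_clr = 0.0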

class Realistic_Projection:
    """Create images from a point cloud based on the view information."""

    def __init__(self):
        _views = np.asarray([
            [[1 * np.pi / 4, 0, np.pi / 2], [-0.5, -0.5, TRANS]],
            [[3 * np.pi / 4, 0, np.pi / 2], [-0.5, -0.5, TRANS]],
            [[5 * np.pi / 4, 0, np.pi / 2], [-0.5, -0.5, TRANS]],
            [[7 * np.pi / 4, 0, np.pi / 2], [-0.5, -0.5, TRANS]],
            [[0 * np.pi / 2, 0, np.pi / 2], [-0.5, -0.5, TRANS]],
            [[1 * np.pi / 2, 0, np.pi / 2], [-0.5, -0.5, TRANS]],
            [[2 * np.pi / 2, 0, np.pi / 2], [-0.5, -0.5, TRANS]],
            [[3 * np.pi / 2, 0, np.pi / 2], [-0.5, -0.5, TRANS]],
            [[0, -np.pi / 2, np.pi / 2], [-0.5, -0.5, TRANS]],
            [[0, np.pi / 2, np.pi / 2], [-0.5, -0.5, TRANS]],
        ])

        # add a small bias to the view angles to reveal more surface
        _views_bias = np.asarray([
            [[0, np.pi / 9, 0], [-0.5, 0, TRANS]],
            [[0, np.pi / 9, 0], [-0.5, 0, TRANS]],
            [[0, np.pi / 9, 0], [-0.5, 0, TRANS]],
            [[0, np.pi / 9, 0], [-0.5, 0, TRANS]],
            [[0, np.pi / 9, 0], [-0.5, 0, TRANS]],
            [[0, np.pi / 9, 0], [-0.5, 0, TRANS]],
            [[0, np.pi / 9, 0], [-0.5, 0, TRANS]],
            [[0, np.pi / 9, 0], [-0.5, 0, TRANS]],
            [[0, np.pi / 15, 0], [-0.5, 0, TRANS]],
            [[0, np.pi / 15, 0], [-0.5, 0, TRANS]],
        ])

        self.num_views = _views.shape[0]

        angle = torch.tensor(_views[:, 0, :]).float()  # .cuda()
        self.rot_mat = euler2mat(angle).transpose(1, 2)
        angle2 = torch.tensor(_views_bias[:, 0, :]).float()  # .cuda()
        self.rot_mat2 = euler2mat(angle2).transpose(1, 2)

        self.translation = torch.tensor(_views[:, 1, :]).float()  # .cuda()
        self.translation = self.translation.unsqueeze(1)

        self.grid2image = Grid2Image()  # .cuda()

    def get_img(self, points):
        b, _, _ = points.shape
        v = self.translation.shape[0]

        _points = self.point_transform(
            points=torch.repeat_interleave(points, v, dim=0),
            rot_mat=self.rot_mat.repeat(b, 1, 1),
            rot_mat2=self.rot_mat2.repeat(b, 1, 1),
            translation=self.translation.repeat(b, 1, 1),
        )

        grid = points2grid(
            points=_points, resolution=params["resolution"], depth=params["depth"]
        ).squeeze()
        img = self.grid2image(grid)
        return img

    @staticmethod
    def point_transform(points, rot_mat, rot_mat2, translation):
        """
        :param points: [batch, num_points, 3]
        :param rot_mat: [batch, 3, 3]
        :param rot_mat2: [batch, 3, 3]
        :param translation: [batch, 1, 3]
        :return: points [batch, num_points, 3]
        """
        rot_mat = rot_mat.to(points.device)
        rot_mat2 = rot_mat2.to(points.device)
        translation = translation.to(points.device)
        points = torch.matmul(points, rot_mat)
        points = torch.matmul(points, rot_mat2)
        points = points - translation
        return points
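
# A usage sketch for Realistic_Projection (our addition, not part of the
# original file): each input cloud is rendered from all 10 views, so a batch
# of B clouds yields B * 10 images. Note that with the default params the
# 7x7 max-pool with padding 2 in Grid2Image trims the 122-cell grid to 120.
def _demo_realistic_projection():
    proj = Realistic_Projection()
    points = torch.rand(2, 1024, 3)
    with torch.no_grad():
        img = proj.get_img(points)
    print(img.shape)  # torch.Size([20, 3, 120, 120])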

def get2DGaussianKernel(ksize, sigma=0):
    # note: sigma must be > 0; the default of 0 would divide by zero
    center = ksize // 2
    xs = np.arange(ksize, dtype=np.float32) - center
    kernel1d = np.exp(-(xs**2) / (2 * sigma**2))
    kernel = kernel1d[..., None] @ kernel1d[None, ...]
    kernel = torch.from_numpy(kernel)
    kernel = kernel / kernel.sum()
    return kernel


# Without numpy
# def get2DGaussianKernel(ksize, sigma):
#     xs = torch.linspace(-(ksize // 2), ksize // 2, steps=ksize)
#     kernel1d = torch.exp(-(xs ** 2) / (2 * sigma ** 2))
#     kernel2d = torch.outer(kernel1d, kernel1d)
#     kernel2d /= kernel2d.sum()
#     return kernel2d


def get3DGaussianKernel(ksize, depth, sigma=2, zsigma=2):
    # work in numpy throughout and convert back to torch at the end; feeding
    # the torch kernel2d straight into np.repeat / torch.sum mixes libraries
    # and breaks on recent PyTorch versions
    kernel2d = get2DGaussianKernel(ksize, sigma).numpy()
    zs = np.arange(depth, dtype=np.float32) - depth // 2
    zkernel = np.exp(-(zs**2) / (2 * zsigma**2))
    kernel3d = np.repeat(kernel2d[None, :, :], depth, axis=0) * zkernel[:, None, None]
    kernel3d = kernel3d / kernel3d.sum()
    return torch.from_numpy(kernel3d)
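
# A quick check of the Gaussian kernels (our addition, not part of the
# original file): both kernels should be normalized to sum to 1, with shapes
# [ksize, ksize] and [depth, ksize, ksize] respectively.
def _demo_gaussian_kernels():
    k2 = get2DGaussianKernel(7, sigma=3)
    k3 = get3DGaussianKernel(7, depth=5, sigma=3, zsigma=2)
    print(k2.shape, float(k2.sum()))  # torch.Size([7, 7]) 1.0
    print(k3.shape, float(k3.sum()))  # torch.Size([5, 7, 7]) 1.0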