PartCrafter

Running on Zero

File size: 13,042 Bytes

bef5729

from src.utils.typing_utils import *

import os
import numpy as np
from PIL import Image
import trimesh
from trimesh.transformations import rotation_matrix
import pyrender
from diffusers.utils import export_to_video
from diffusers.utils.loading_utils import load_video
import torch
from torchvision.utils import make_grid

os.environ['PYOPENGL_PLATFORM'] = 'egl'

def render(
    scene: pyrender.Scene,
    renderer: pyrender.Renderer,
    camera: pyrender.Camera,
    pose: np.ndarray,
    light: Optional[pyrender.Light] = None,
    normalize_depth: bool = False,
    flags: int = pyrender.constants.RenderFlags.NONE,
    return_type: Literal['pil', 'ndarray'] = 'pil'
) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[Image.Image, Image.Image]]:
    camera_node = scene.add(camera, pose=pose)
    if light is not None:
        light_node = scene.add(light, pose=pose)
    image, depth = renderer.render(
        scene, 
        flags=flags
    )
    scene.remove_node(camera_node)
    if light is not None:
        scene.remove_node(light_node)
    if normalize_depth or return_type == 'pil':
        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
    if return_type == 'pil':
        image = Image.fromarray(image)
        depth = Image.fromarray(depth.astype(np.uint8))
    return image, depth

def rotation_matrix_from_vectors(vec1, vec2):
    a, b = vec1 / np.linalg.norm(vec1), vec2 / np.linalg.norm(vec2)
    v = np.cross(a, b)
    c = np.dot(a, b)
    s = np.linalg.norm(v)
    if s == 0:
        return np.eye(3) if c > 0 else -np.eye(3)
    kmat = np.array([
        [0, -v[2], v[1]],
        [v[2], 0, -v[0]],
        [-v[1], v[0], 0]
    ])
    return np.eye(3) + kmat + kmat @ kmat * ((1 - c) / (s ** 2))

def create_circular_camera_positions(
    num_views: int,
    radius: float,
    axis: np.ndarray = np.array([0.0, 1.0, 0.0])
) -> List[np.ndarray]:
    # Create a list of positions for a circular camera trajectory
    # around the given axis with the given radius.
    positions = []
    axis = axis / np.linalg.norm(axis)
    for i in range(num_views):
        theta = 2 * np.pi * i / num_views
        position = np.array([
            np.sin(theta) * radius,
            0.0,
            np.cos(theta) * radius
        ])
        if not np.allclose(axis, np.array([0.0, 1.0, 0.0])):
            R = rotation_matrix_from_vectors(np.array([0.0, 1.0, 0.0]), axis)
            position = R @ position
        positions.append(position)
    return positions

def create_circular_camera_poses(
    num_views: int,
    radius: float,
    axis: np.ndarray = np.array([0.0, 1.0, 0.0])
) -> List[np.ndarray]:
    # Create a list of poses for a circular camera trajectory
    # around the given axis with the given radius.
    # The camera always looks at the origin.
    # The up vector is always [0, 1, 0].
    canonical_pose = np.array([
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, radius],
        [0.0, 0.0, 0.0, 1.0]
    ])
    poses = []
    for i in range(num_views):
        theta = 2 * np.pi * i / num_views
        R = rotation_matrix(
            angle=theta,
            direction=axis,
            point=[0, 0, 0]
        )
        pose = R @ canonical_pose
        poses.append(pose)
    return poses

def render_views_around_mesh(
    mesh: Union[trimesh.Trimesh, trimesh.Scene],
    num_views: int = 36,
    radius: float = 3.5,
    axis: np.ndarray = np.array([0.0, 1.0, 0.0]),
    image_size: tuple = (512, 512),
    fov: float = 40.0,
    light_intensity: Optional[float] = 5.0,
    znear: float = 0.1,
    zfar: float = 10.0, 
    normalize_depth: bool = False,
    flags: int = pyrender.constants.RenderFlags.NONE,
    return_depth: bool = False, 
    return_type: Literal['pil', 'ndarray'] = 'pil'
) -> Union[
        List[Image.Image], 
        List[np.ndarray], 
        Tuple[List[Image.Image], List[Image.Image]], 
        Tuple[List[np.ndarray], List[np.ndarray]]
    ]:
    
    if not isinstance(mesh, (trimesh.Trimesh, trimesh.Scene)):
        raise ValueError("mesh must be a trimesh.Trimesh or trimesh.Scene object")
    if isinstance(mesh, trimesh.Trimesh):
        mesh = trimesh.Scene(mesh)

    scene = pyrender.Scene.from_trimesh_scene(mesh)
    light = pyrender.DirectionalLight(
        color=np.ones(3), 
        intensity=light_intensity
    ) if light_intensity is not None else None
    camera = pyrender.PerspectiveCamera(
        yfov=np.deg2rad(fov),
        aspectRatio=image_size[0]/image_size[1],
        znear=znear,
        zfar=zfar
    )
    renderer = pyrender.OffscreenRenderer(*image_size)

    camera_poses = create_circular_camera_poses(
        num_views, 
        radius, 
        axis = axis
    )

    images, depths = [], []
    for pose in camera_poses:
        image, depth = render(
            scene, renderer, camera, pose, light, 
            normalize_depth=normalize_depth,
            flags=flags,
            return_type=return_type
        )
        images.append(image)
        depths.append(depth)

    renderer.delete()

    if return_depth:
        return images, depths
    return images

def render_normal_views_around_mesh(
    mesh: Union[trimesh.Trimesh, trimesh.Scene],
    num_views: int = 36,
    radius: float = 3.5,
    axis: np.ndarray = np.array([0.0, 1.0, 0.0]),
    image_size: tuple = (512, 512),
    fov: float = 40.0,
    light_intensity: Optional[float] = 5.0,
    znear: float = 0.1,
    zfar: float = 10.0,
    normalize_depth: bool = False,
    flags: int = pyrender.constants.RenderFlags.NONE,
    return_depth: bool = False, 
    return_type: Literal['pil', 'ndarray'] = 'pil'
) -> Union[
        List[Image.Image], 
        List[np.ndarray], 
        Tuple[List[Image.Image], List[Image.Image]], 
        Tuple[List[np.ndarray], List[np.ndarray]]
    ]:
    
    if not isinstance(mesh, (trimesh.Trimesh, trimesh.Scene)):
        raise ValueError("mesh must be a trimesh.Trimesh or trimesh.Scene object")
    if isinstance(mesh, trimesh.Scene):
        mesh = mesh.to_geometry()
    normals = mesh.vertex_normals
    colors = ((normals + 1.0) / 2.0 * 255).astype(np.uint8)
    mesh.visual = trimesh.visual.ColorVisuals(
        mesh=mesh,
        vertex_colors=colors
    )
    mesh = trimesh.Scene(mesh)
    return render_views_around_mesh(
        mesh, num_views, radius, axis, 
        image_size, fov, light_intensity, znear, zfar, 
        normalize_depth, flags,
        return_depth, return_type
    )

def create_camera_pose_on_sphere(
    azimuth: float = 0.0, # in degrees
    elevation: float = 0.0, # in degrees
    radius: float = 3.5,
) -> np.ndarray:
    # Create a camera pose for a given azimuth and elevation
    # with the given radius.
    # The camera always looks at the origin.
    # The up vector is always [0, 1, 0].
    canonical_pose = np.array([
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, radius],
        [0.0, 0.0, 0.0, 1.0]
    ])
    azimuth = np.deg2rad(azimuth)
    elevation = np.deg2rad(elevation)
    position = np.array([
        np.cos(elevation) * np.sin(azimuth),
        np.sin(elevation),
        np.cos(elevation) * np.cos(azimuth),
    ])
    R = np.eye(4)
    R[:3, :3] = rotation_matrix_from_vectors(
        np.array([0.0, 0.0, 1.0]), 
        position
    )
    pose = R @ canonical_pose
    return pose

def render_single_view(
    mesh: Union[trimesh.Trimesh, trimesh.Scene],
    azimuth: float = 0.0, # in degrees
    elevation: float = 0.0, # in degrees
    radius: float = 3.5,
    image_size: tuple = (512, 512),
    fov: float = 40.0,
    light_intensity: Optional[float] = 5.0,
    num_env_lights: int = 0, 
    znear: float = 0.1,
    zfar: float = 10.0,
    normalize_depth: bool = False,
    flags: int = pyrender.constants.RenderFlags.NONE,
    return_depth: bool = False, 
    return_type: Literal['pil', 'ndarray'] = 'pil'
) -> Union[
        Image.Image, 
        np.ndarray, 
        Tuple[Image.Image, Image.Image], 
        Tuple[np.ndarray, np.ndarray]
    ]:
    
    if not isinstance(mesh, (trimesh.Trimesh, trimesh.Scene)):
        raise ValueError("mesh must be a trimesh.Trimesh or trimesh.Scene object")
    if isinstance(mesh, trimesh.Trimesh):
        mesh = trimesh.Scene(mesh)

    scene = pyrender.Scene.from_trimesh_scene(mesh)
    light = pyrender.DirectionalLight(
        color=np.ones(3), 
        intensity=light_intensity
    ) if light_intensity is not None else None
    camera = pyrender.PerspectiveCamera(
        yfov=np.deg2rad(fov),
        aspectRatio=image_size[0]/image_size[1],
        znear=znear,
        zfar=zfar
    )
    renderer = pyrender.OffscreenRenderer(*image_size)

    camera_pose = create_camera_pose_on_sphere(
        azimuth,
        elevation,
        radius
    )

    if num_env_lights > 0:
        env_light_poses = create_circular_camera_poses(
            num_env_lights,
            radius,
            axis = np.array([0.0, 1.0, 0.0])
        )
        for pose in env_light_poses:
            scene.add(pyrender.DirectionalLight(
                color=np.ones(3),
                intensity=light_intensity
            ), pose=pose)
        # set light to None
        light = None

    image, depth = render(
        scene, renderer, camera, camera_pose, light,
        normalize_depth=normalize_depth,
        flags=flags,
        return_type=return_type
    )
    renderer.delete()

    if return_depth:
        return image, depth
    return image

def render_normal_single_view(
    mesh: Union[trimesh.Trimesh, trimesh.Scene],
    azimuth: float = 0.0, # in degrees
    elevation: float = 0.0, # in degrees
    radius: float = 3.5,
    image_size: tuple = (512, 512),
    fov: float = 40.0,
    light_intensity: Optional[float] = 5.0,
    znear: float = 0.1,
    zfar: float = 10.0,
    normalize_depth: bool = False,
    flags: int = pyrender.constants.RenderFlags.NONE,
    return_depth: bool = False,
    return_type: Literal['pil', 'ndarray'] = 'pil'
) -> Union[
        Image.Image,
        np.ndarray,
        Tuple[Image.Image, Image.Image],
        Tuple[np.ndarray, np.ndarray]
    ]:

    if not isinstance(mesh, (trimesh.Trimesh, trimesh.Scene)):
        raise ValueError("mesh must be a trimesh.Trimesh or trimesh.Scene object")
    if isinstance(mesh, trimesh.Scene):
        mesh = mesh.to_geometry()
    normals = mesh.vertex_normals
    colors = ((normals + 1.0) / 2.0 * 255).astype(np.uint8)
    mesh.visual = trimesh.visual.ColorVisuals(
        mesh=mesh,
        vertex_colors=colors
    )
    mesh = trimesh.Scene(mesh)
    return render_single_view(
        mesh, azimuth, elevation, radius, 
        image_size, fov, light_intensity, znear, zfar,
        normalize_depth, flags, 
        return_depth, return_type
    )

def export_renderings(
    images: List[Image.Image],
    export_path: str,
    fps: int = 36, 
    loop: int = 0
): 
    export_type = export_path.split('.')[-1]
    if export_type == 'mp4':
        export_to_video(
            images,
            export_path,
            fps=fps,
        )
    elif export_type == 'gif':
        duration = 1000 / fps
        images[0].save(
            export_path,
            save_all=True,
            append_images=images[1:],
            duration=duration,
            loop=loop
        )
    else:
        raise ValueError(f'Unknown export type: {export_type}')
    
def make_grid_for_images_or_videos(
    images_or_videos: Union[List[Image.Image], List[List[Image.Image]]],
    nrow: int = 4, 
    padding: int = 0, 
    pad_value: int = 0, 
    image_size: tuple = (512, 512),
    return_type: Literal['pil', 'ndarray'] = 'pil'
) -> Union[Image.Image, List[Image.Image], np.ndarray]:
    if isinstance(images_or_videos[0], Image.Image):
        images = [np.array(image.resize(image_size).convert('RGB')) for image in images_or_videos]
        images = np.stack(images, axis=0).transpose(0, 3, 1, 2) # [N, C, H, W]
        images = torch.from_numpy(images)
        image_grid = make_grid(
            images,
            nrow=nrow,
            padding=padding,
            pad_value=pad_value,
            normalize=False
        ) # [C, H', W']
        image_grid = image_grid.cpu().numpy()
        if return_type == 'pil':
            image_grid = Image.fromarray(image_grid.transpose(1, 2, 0))
        return image_grid
    elif isinstance(images_or_videos[0], list) and isinstance(images_or_videos[0][0], Image.Image):
        image_grids = []
        for i in range(len(images_or_videos[0])):
            images = [video[i] for video in images_or_videos]
            image_grid = make_grid_for_images_or_videos(
                images,
                nrow=nrow,
                padding=padding,
                return_type=return_type
            )
            image_grids.append(image_grid)
        if return_type == 'ndarray':
            image_grids = np.stack(image_grids, axis=0)
        return image_grids
    else:
        raise ValueError(f'Unknown input type: {type(images_or_videos[0])}')