Spaces:

juanmackie
/

2DTo3DSpatialPhotoConverter

Running

File size: 12,510 Bytes

import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
import tempfile # For creating temporary video files
import os # Import the 'os' module
import accelerate # Import accelerate for better memory management (recommended)

# Marigold specific imports
from diffusers import MarigoldDepthPipeline, DDIMScheduler
from huggingface_hub import login # For Hugging Face Hub login if needed

# --- Marigold Model Setup ---
CHECKPOINT = "prs-eth/marigold-depth-v1-1"

# Check for HF_TOKEN_LOGIN environment variable for private models or higher rate limits
if "HF_TOKEN_LOGIN" in os.environ:
    login(token=os.environ["HF_TOKEN_LOGIN"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Use bfloat16 for CUDA if available for performance, else float32
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

# Load the Marigold pipeline
try:
    pipe = MarigoldDepthPipeline.from_pretrained(CHECKPOINT)
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
    pipe = pipe.to(device=device, dtype=dtype)
    
    # Enable xformers for memory-efficient attention ONLY IF CUDA is available
    if torch.cuda.is_available():
        try:
            import xformers
            pipe.enable_xformers_memory_efficient_attention()
            print("xFormers enabled for Marigold pipeline.")
        except ImportError:
            print("xFormers not found, running without memory-efficient attention (on GPU).")
    else:
        print("Running on CPU or MPS. xFormers memory-efficient attention is not applicable.")

    print(f"MarigoldDepthPipeline loaded successfully from {CHECKPOINT} on {device}.")
except Exception as e:
    print(f"Error loading MarigoldDepthPipeline: {e}")
    pipe = None # Set pipe to None to gracefully handle if it couldn't be loaded

# --- Default Marigold Parameters (from their demo) ---
DEFAULT_MARIGOLD_ENSEMBLE_SIZE = 1
DEFAULT_MARIGOLD_DENOISE_STEPS = 4
DEFAULT_MARIGOLD_PROCESSING_RES = 768 # Recommended resolution for Marigold

def process_image(image, max_disparity_ratio, inpaint_radius, ensemble_size, denoise_steps, processing_res):
    """
    Convert a 2D photo to a stereoscopic 3D image pair using Marigold for depth estimation
    and DIBR, with adjustable parameters.
    """
    if pipe is None:
        print("Error: Marigold model not loaded. Cannot process image.")
        return Image.new('RGB', (200, 200), color = 'red')

    # Convert PIL image to numpy array
    image_np = np.array(image)
    height, width = image_np.shape[:2]

    # Step 1: Estimate the depth map using Marigold
    try:
        # Marigold's pipeline directly takes a PIL Image.
        # Use a fixed seed for reproducibility if desired, otherwise remove 'generator'.
        generator = torch.Generator(device=device).manual_seed(2024) 
        marigold_output = pipe(
            image, # Pass PIL Image directly
            ensemble_size=ensemble_size,
            num_inference_steps=denoise_steps,
            processing_resolution=processing_res,
            batch_size=1 if processing_res == 0 else 2, # Batch size recommended by Marigold for resolutions
            generator=generator,
        ).prediction # This is the predicted depth map as a torch.Tensor

        # Move to CPU and convert to NumPy array
        depth_map = marigold_output.squeeze().cpu().numpy()

    except Exception as e:
        print(f"Error during Marigold depth estimation: {e}")
        # Return an orange image to indicate a depth estimation specific error
        return Image.new('RGB', (200, 200), color = 'orange')

    # Normalize the depth map to [0,1]
    if depth_map.max() - depth_map.min() > 0:
        depth_map = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min())
    else:
        depth_map = np.zeros_like(depth_map) # Handle flat depth map

    # Smooth the depth map to reduce noise for DIBR
    depth_map = cv2.GaussianBlur(depth_map, (5, 5), 0)

    # Step 2: Calculate the disparity map (inversely proportional to depth)
    max_disparity_pixels = int(max_disparity_ratio * width)
    disparity_map = max_disparity_pixels * (1 - depth_map)

    # Step 3: Initialize left and right images and masks for DIBR
    left_image = np.zeros_like(image_np)
    right_image = np.zeros_like(image_np)
    left_mask = np.ones((height, width), dtype=bool)
    right_mask = np.ones((height, width), dtype=bool)

    # Step 4: Perform pixel shifting (forward warping)
    for y in range(height):
        for x in range(width):
            disparity = int(disparity_map[y, x])
            
            new_x_left = x + disparity
            new_x_right = x - disparity
            
            if 0 <= new_x_left < width:
                left_image[y, new_x_left] = image_np[y, x]
                left_mask[y, new_x_left] = False
            
            if 0 <= new_x_right < width:
                right_image[y, new_x_right] = image_np[y, x]
                right_mask[y, new_x_right] = False

    # Convert masks to uint8 for OpenCV inpainting
    left_mask_uint8 = left_mask.astype(np.uint8) * 255
    right_mask_uint8 = right_mask.astype(np.uint8) * 255

    # Step 5: Apply inpainting to fill holes
    left_image_inpaint = cv2.inpaint(left_image, left_mask_uint8, inpaint_radius, cv2.INPAINT_TELEA)
    right_image_inpaint = cv2.inpaint(right_image, right_mask_uint8, inpaint_radius, cv2.INPAINT_TELEA)

    # Step 6: Combine into a side-by-side stereoscopic image
    stereo_image = np.hstack((left_image_inpaint, right_image_inpaint))

    return Image.fromarray(stereo_image)


def process_video(video_path, max_disparity_ratio, inpaint_radius, ensemble_size, denoise_steps, processing_res):
    """
    Convert a 2D video to a stereoscopic 3D video by processing each frame.
    """
    if pipe is None:
        print("Error: Marigold model not loaded. Cannot process video.")
        return None

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video file at {video_path}")
        return None

    fps = cap.get(cv2.CAP_PROP_FPS)
    original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    output_width = original_width * 2
    output_height = original_height

    temp_output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Codec for MP4
    out = cv2.VideoWriter(temp_output_video_path, fourcc, fps, (output_width, output_height))

    if not out.isOpened():
        print(f"Error: Could not create video writer for {temp_output_video_path}")
        cap.release()
        return None

    frame_count = 0
    while True:
        ret, frame_bgr = cap.read() # frame_bgr is in BGR format
        if not ret:
            break

        frame_rgb_pil = Image.fromarray(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
        
        # Process the single frame using the existing image processing logic
        processed_frame_pil = process_image(
            frame_rgb_pil, 
            max_disparity_ratio, 
            inpaint_radius,
            ensemble_size, # Pass Marigold params
            denoise_steps, # Pass Marigold params
            processing_res   # Pass Marigold params
        )
        
        if processed_frame_pil is None:
            print(f"Skipping frame {frame_count} due to processing error.")
            processed_frame_bgr = np.zeros((output_height, output_width, 3), dtype=np.uint8)
        else:
            processed_frame_np_rgb = np.array(processed_frame_pil)
            processed_frame_bgr = cv2.cvtColor(processed_frame_np_rgb, cv2.COLOR_RGB2BGR)

        out.write(processed_frame_bgr)
        frame_count += 1
        print(f"Processed frame {frame_count}...")

    cap.release()
    out.release()
    print(f"Finished processing {frame_count} frames. Output video saved to: {temp_output_video_path}")
    return temp_output_video_path

# Define the Gradio web interface layout and components
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 2D to Stereoscopic 3D Converter (with Marigold Depth)
        Upload a 2D photo or video to generate a stereoscopic 3D image or video pair for viewing on a Quest headset.
        The output is a side-by-side format: left half for the left eye, right half for the right eye.
        Adjust the sliders to fine-tune the 3D effect and Marigold's depth estimation.
        """
    )
    
    # Global sliders for DIBR and Marigold parameters
    with gr.Row():
        max_disparity_slider = gr.Slider(
            minimum=0.01,
            maximum=0.10,
            value=0.03, # A balanced default
            step=0.005,
            label="Max Disparity Ratio (controls 3D intensity)",
            info="Higher values mean a stronger 3D effect, but can cause more distortion."
        )
        inpaint_radius_slider = gr.Slider(
            minimum=1,
            maximum=20,
            value=5, # A common default for inpainting
            step=1,
            label="Inpainting Radius (controls hole filling)",
            info="Larger values fill holes more, but can blur details around shifted objects."
        )

    with gr.Accordion("Marigold Depth Estimation Settings", open=False):
        with gr.Row():
            ensemble_size_slider = gr.Slider(
                label="Marigold Ensemble size",
                minimum=1,
                maximum=10,
                step=1,
                value=DEFAULT_MARIGOLD_ENSEMBLE_SIZE,
                info="Higher values improve accuracy but increase processing time."
            )
            denoise_steps_slider = gr.Slider(
                label="Marigold Denoising steps",
                minimum=1,
                maximum=20,
                step=1,
                value=DEFAULT_MARIGOLD_DENOISE_STEPS,
                info="More steps improve quality but increase processing time."
            )
            processing_res_radio = gr.Radio(
                [
                    ("Native", 0),
                    ("Recommended (768)", 768),
                    ("High (1024)", 1024)
                ],
                label="Marigold Processing resolution",
                value=DEFAULT_MARIGOLD_PROCESSING_RES,
                info="Resolution for Marigold's internal processing. Native uses original image resolution. Higher resolutions are more accurate but slower."
            )
    
    with gr.Tabs():
        with gr.TabItem("Image Conversion"):
            with gr.Row():
                with gr.Column():
                    image_input = gr.Image(type="pil", label="Upload a 2D Photo")
                    image_process_button = gr.Button("Convert Image to 3D")
                with gr.Column():
                    image_output = gr.Image(type="pil", label="Stereoscopic 3D Image Output (Side-by-Side)")
            # Connect the image button to the image processing function
            image_process_button.click(
                fn=process_image,
                inputs=[
                    image_input, 
                    max_disparity_slider, 
                    inpaint_radius_slider,
                    ensemble_size_slider, 
                    denoise_steps_slider, 
                    processing_res_radio
                ],
                outputs=image_output
            )

        with gr.TabItem("Video Conversion"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="Upload a 2D MP4 Video")
                    video_process_button = gr.Button("Convert Video to 3D")
                with gr.Column():
                    video_output = gr.Video(label="Stereoscopic 3D Video Output (Side-by-Side)")
            # Connect the video button to the video processing function
            video_process_button.click(
                fn=process_video,
                inputs=[
                    video_input, 
                    max_disparity_slider, 
                    inpaint_radius_slider,
                    ensemble_size_slider, 
                    denoise_steps_slider, 
                    processing_res_radio
                ],
                outputs=video_output
            )

# This block is executed when the script is run directly (e.g., for local testing).
# Hugging Face Spaces typically runs the app via its own internal mechanisms.
if __name__ == '__main__':
    demo.launch()