import numpy as np
import torch
import cv2
from PIL import Image
from transformers import pipeline
import gradio as gr

# Select GPU when available; MiDaS takes a torch.device, while the
# transformers pipeline expects an integer device index.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device_index = 0 if torch.cuda.is_available() else -1

# MiDaS depth-estimation model (DPT-Large) from torch.hub, plus its matching input transform.
midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large")
midas.to(device).eval()
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = midas_transforms.dpt_transform
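
# Note: DPT_Large is heavy on CPU-only machines. A lighter option (an alternative
# suggestion, not part of the original setup) is the "MiDaS_small" checkpoint with
# its matching transform:
#   midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
#   transform = midas_transforms.small_transform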

# SegFormer-B0 semantic segmentation (ADE20K classes) via the transformers pipeline;
# fp16 is only used when running on GPU.
segmenter = pipeline(
    "image-segmentation",
    model="nvidia/segformer-b0-finetuned-ade-512-512",
    device=device_index,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
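
# For reference, each result from the segmentation pipeline is a dict carrying a class
# "label" and a PIL "mask"; segment_image() below relies on the "mask" key. A quick
# sanity check (optional, not part of the app flow) could be:
#   sample = segmenter(Image.new("RGB", (64, 64)))
#   print([r["label"] for r in sample])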


def resize_image(img: Image.Image, max_size: int = 512) -> Image.Image:
    """Downscale so the longer side is at most max_size, preserving aspect ratio."""
    width, height = img.size
    if max(width, height) > max_size:
        ratio = max_size / max(width, height)
        new_size = (int(width * ratio), int(height * ratio))
        return img.resize(new_size, Image.LANCZOS)
    return img


def predict_depth(image: Image.Image) -> Image.Image:
    """Run MiDaS depth estimation and return the depth map as an 8-bit grayscale image."""
    # Gradio passes a PIL image (type="pil"); wrap anything else and force RGB.
    if not isinstance(image, Image.Image):
        image = Image.fromarray(np.asarray(image))
    img = image.convert("RGB")
    img_np = np.array(img)

    # The DPT transform already returns a batched tensor; only add a batch dim if needed.
    input_tensor = transform(img_np).to(device)
    input_batch = input_tensor.unsqueeze(0) if input_tensor.ndim == 3 else input_tensor

    with torch.no_grad():
        prediction = midas(input_batch)
        # Upsample the raw prediction back to the input resolution.
        prediction = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=img_np.shape[:2],
            mode="bicubic",
            align_corners=False,
        ).squeeze()

    depth_map = prediction.cpu().numpy()
    # Normalise to [0, 255]; the epsilon guards against a constant (zero-range) depth map.
    depth_min, depth_max = depth_map.min(), depth_map.max()
    depth_map = (depth_map - depth_min) / (depth_max - depth_min + 1e-8)
    depth_map = (depth_map * 255).astype(np.uint8)
    return Image.fromarray(depth_map)
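

# Optional helper, not wired into the Gradio interface: a sketch of how the grayscale
# depth map could be colorized for inspection, assuming OpenCV (imported above) is available.
def colorize_depth(depth_img: Image.Image) -> Image.Image:
    """Apply a JET colormap to an 8-bit depth map and return an RGB PIL image."""
    depth_u8 = np.array(depth_img.convert("L"), dtype=np.uint8)
    colored_bgr = cv2.applyColorMap(depth_u8, cv2.COLORMAP_JET)
    return Image.fromarray(cv2.cvtColor(colored_bgr, cv2.COLOR_BGR2RGB))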


def segment_image(img: Image.Image) -> Image.Image:
    """Run SegFormer on an image and blend a random color over each predicted mask."""
    img = img.convert("RGB")
    img_resized = resize_image(img)
    results = segmenter(img_resized)

    overlay = np.array(img_resized, dtype=np.uint8)
    for res in results:
        mask = np.array(res["mask"], dtype=bool)
        color = np.random.randint(50, 255, 3, dtype=np.uint8)
        # Alpha-blend the class color into the masked pixels.
        overlay[mask] = (overlay[mask] * 0.6 + color * 0.4).astype(np.uint8)

    return Image.fromarray(overlay)
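
# The random mask colors above change on every call. A deterministic alternative
# (an illustrative variation, not the original behavior) could derive a stable color
# from each label string:
#   import hashlib
#   digest = hashlib.md5(res["label"].encode()).digest()
#   color = np.frombuffer(digest[:3], dtype=np.uint8)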


def predict_fn(input_img: Image.Image) -> Image.Image:
    """Full pipeline: estimate depth with MiDaS, then segment the depth map with SegFormer."""
    depth_img = predict_depth(input_img)
    seg_img = segment_image(depth_img)
    return seg_img
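
# Quick local check without the web UI (the file names here are hypothetical examples):
#   result = predict_fn(Image.open("example.jpg"))
#   result.save("depth_segmentation.png")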


iface = gr.Interface(
    fn=predict_fn,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Image(type="pil", label="Segmented Depth Overlay"),
    title="Depth-then-Segmentation Pipeline",
    description="Upload an image. The app first computes a depth map with MiDaS, then applies SegFormer segmentation to that depth map.",
)

if __name__ == "__main__":
    iface.launch()
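    # launch() also accepts options such as share=True or server_name="0.0.0.0"
    # if the demo needs to be reachable beyond localhost.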