EgoHackZero
try to add segmentation step
6225fda
import numpy as np
import torch
from PIL import Image
from transformers import pipeline
import gradio as gr
# ===== Device Setup =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
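# Hugging Face pipelines address devices by integer index: GPU ordinal, or -1 for CPU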
device_index = 0 if torch.cuda.is_available() else -1
# ===== MiDaS Depth Estimation Setup =====
# Load MiDaS model and transforms
midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large")
midas.to(device).eval()
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = midas_transforms.dpt_transform
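# The hub's dpt_transform takes an RGB NumPy array (H, W, 3) and already
# returns a batched (1, 3, H, W) tensor ready for the model.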
# ===== Segmentation Setup =====
segmenter = pipeline(
    "image-segmentation",
    model="nvidia/segformer-b0-finetuned-ade-512-512",
    device=device_index,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
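# Each pipeline result is a dict carrying a class "label" and a binary
# PIL "mask" for that class; labels come from the ADE20K label set.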
# ===== Utility Functions =====
def resize_image(img: Image.Image, max_size: int = 512) -> Image.Image:
    """Downscale so the longer side is at most max_size, preserving aspect ratio."""
    width, height = img.size
    if max(width, height) > max_size:
        ratio = max_size / max(width, height)
        new_size = (int(width * ratio), int(height * ratio))
        return img.resize(new_size, Image.LANCZOS)
    return img
# ===== Depth Prediction =====
def predict_depth(image: Image.Image) -> Image.Image:
    # Accept either a PIL Image or a NumPy array, and force RGB
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    img_np = np.array(image.convert("RGB"))
    # Convert to the format expected by MiDaS (DPT transform batches for us)
    input_tensor = transform(img_np).to(device)
    input_batch = input_tensor.unsqueeze(0) if input_tensor.ndim == 3 else input_tensor
    # Predict depth and upsample back to the input resolution
    with torch.no_grad():
        prediction = midas(input_batch)
        prediction = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=img_np.shape[:2],
            mode="bicubic",
            align_corners=False,
        ).squeeze()
    # MiDaS outputs relative inverse depth; normalize to 0-255 for display.
    # The epsilon guards against division by zero on a constant map.
    depth_map = prediction.cpu().numpy()
    depth_map = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min() + 1e-8)
    depth_map = (depth_map * 255).astype(np.uint8)
    return Image.fromarray(depth_map)
# ===== Segmentation =====
def segment_image(img: Image.Image) -> Image.Image:
    img = img.convert("RGB")
    img_resized = resize_image(img)
    results = segmenter(img_resized)
    overlay = np.array(img_resized, dtype=np.uint8)
    # Blend a random color into each predicted class region (60/40 mix)
    for res in results:
        mask = np.array(res["mask"], dtype=bool)
        color = np.random.randint(50, 255, 3, dtype=np.uint8)
        overlay[mask] = (overlay[mask] * 0.6 + color * 0.4).astype(np.uint8)
    return Image.fromarray(overlay)
# ===== Gradio App =====
def predict_fn(input_img: Image.Image) -> Image.Image:
    # 1. Compute the depth map (single-channel grayscale)
    depth_img = predict_depth(input_img)
    # 2. Segment the depth map (convert('RGB') replicates it to 3 channels)
    seg_img = segment_image(depth_img)
    return seg_img
iface = gr.Interface(
    fn=predict_fn,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Image(type="pil", label="Segmented Depth Overlay"),
    title="Depth-then-Segmentation Pipeline",
    description="Upload an image. First computes a depth map via MiDaS, then applies SegFormer segmentation on the depth map.",
)
if __name__ == "__main__":
    iface.launch()
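# Headless sanity check, a sketch without the Gradio UI ("example.jpg" is a
# hypothetical test file, not part of this repo); uncomment to run once:
# result = predict_fn(Image.open("example.jpg"))
# result.save("segmented_depth_overlay.png")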