import pathlib
import sys
import urllib.request

import torch

ROOT = pathlib.Path(__file__).resolve().parents[3]
MODEL_DIR = ROOT / "models"
DEPTH_REPO = MODEL_DIR / "Depth-Anything-V2"
DEPTH_W = ROOT / "weights" / "depth_anything_v2" / "depth_anything_v2_vits.pth"
SAM_REPO = MODEL_DIR / "segment-anything"
SAM_W = ROOT / "weights" / "segment-anything" / "sam_vit_b_01ec64.pth"

# The vendored model repos must be on sys.path before their imports below.
sys.path.insert(0, str(DEPTH_REPO))
sys.path.insert(0, str(SAM_REPO))

from depth_anything_v2.dpt import DepthAnythingV2
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator

def _download(url: str, dest: pathlib.Path) -> None:
    """Download `url` to `dest` unless the file already exists."""
    dest.parent.mkdir(parents=True, exist_ok=True)
    if not dest.exists():
        urllib.request.urlretrieve(url, dest)

def load_depth(device):
    """Load Depth Anything V2 (ViT-S), downloading the weights on first use."""
    _download(
        "https://huggingface.co/Depth-Anything/Depth-Anything-V2-Small/resolve/main/depth_anything_v2_vits.pth",
        DEPTH_W,
    )
    # Model configuration for the ViT-S encoder.
    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    }
    net = DepthAnythingV2(**model_configs['vits'])
    state_dict = torch.load(DEPTH_W, map_location=device)
    net.load_state_dict(state_dict, strict=True)
    return net.to(device).eval()

def load_sam(device):
    """Load SAM (ViT-B) and wrap it in an automatic mask generator."""
    _download("https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth", SAM_W)
    sam = sam_model_registry["vit_b"](checkpoint=str(SAM_W)).to(device)
    return SamAutomaticMaskGenerator(sam)
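
# Usage sketch: load both models once at startup and reuse them. The device
# string here is an assumption; any torch device the weights fit on works.
#
#     device = "cuda" if torch.cuda.is_available() else "cpu"
#     depth_net = load_depth(device)
#     mask_gen = load_sam(device)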

def predict_depth(net, img_rgb, device):
    """Predict a depth map for an (H, W, 3) uint8 RGB image."""
    from agent.io import pad_to_multiple_of_14, safe_resize

    # Depth Anything V2 expects spatial dims divisible by its 14-px patch size.
    img = pad_to_multiple_of_14(safe_resize(img_rgb))
    ten = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).float() / 255
    with torch.no_grad():
        d = net(ten.to(device)).squeeze().cpu().numpy()
    return d
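
# Example (sketch): running depth on an image from disk. Assumes Pillow and
# numpy are installed; `scene.jpg` is a hypothetical file.
#
#     import numpy as np
#     from PIL import Image
#     img_rgb = np.asarray(Image.open("scene.jpg").convert("RGB"))
#     depth = predict_depth(depth_net, img_rgb, device)  # 2-D float array at the processed size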

def generate_masks(mask_gen, img_rgb, use_goal1_approach=True):
    """Generate SAM masks using one of two preprocessing strategies."""
    from agent.io import safe_resize, pad_to_multiple_of_14

    if use_goal1_approach:
        # Goal1 approach: resize the long side to 1024 px with no padding;
        # this matches SAM's native input size and produces better masks.
        img_processed = safe_resize(img_rgb, long_side=1024)
    else:
        # Goal2 approach: reuse the depth preprocessing so the masks share the
        # depth map's geometry (consistent alignment, weaker SAM results).
        img_processed = pad_to_multiple_of_14(safe_resize(img_rgb))
    return mask_gen.generate(img_processed)
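
# Each entry returned by SamAutomaticMaskGenerator.generate() is a dict with
# keys including "segmentation" (H x W bool array), "area", and "bbox".
# Sketch of dropping small masks (the 1000-px threshold is an assumption):
#
#     masks = generate_masks(mask_gen, img_rgb)
#     large = [m for m in masks if m["area"] > 1000]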

def predict_depth_and_masks(depth_net, mask_gen, img_rgb, device, approach="goal1_sam"):
    """
    Generate depth and masks with different alignment strategies.

    Args:
        approach:
            - "goal1_sam": Goal1's SAM preprocessing (1024 px, no padding).
              Better SAM results, but depth and masks use different geometries.
            - "aligned": identical preprocessing for both models, giving
              pixel-perfect alignment between the depth map and the masks.

    Returns:
        (depth, masks, depth_img, sam_img) for "goal1_sam";
        (depth, masks, img_processed) for "aligned".
    """
    from agent.io import pad_to_multiple_of_14, safe_resize

    if approach == "goal1_sam":
        # Separate preprocessing per model for better SAM results.
        depth_img = pad_to_multiple_of_14(safe_resize(img_rgb))  # depth: 960 px resize + padding
        sam_img = safe_resize(img_rgb, long_side=1024)           # SAM: 1024 px, no padding
        # Generate the depth map.
        ten = torch.from_numpy(depth_img).permute(2, 0, 1).unsqueeze(0).float() / 255
        with torch.no_grad():
            depth = depth_net(ten.to(device)).squeeze().cpu().numpy()
        # Generate masks with the Goal1 preprocessing.
        masks = mask_gen.generate(sam_img)
        return depth, masks, depth_img, sam_img

    # "aligned" (Goal2): run both models on the same processed image.
    img_processed = pad_to_multiple_of_14(safe_resize(img_rgb))
    ten = torch.from_numpy(img_processed).permute(2, 0, 1).unsqueeze(0).float() / 255
    with torch.no_grad():
        depth = depth_net(ten.to(device)).squeeze().cpu().numpy()
    masks = mask_gen.generate(img_processed)
    return depth, masks, img_processed
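
# End-to-end sketch, guarded so nothing runs on import. `scene.jpg` and the
# device choice are assumptions for illustration.
if __name__ == "__main__":
    import numpy as np
    from PIL import Image

    device = "cuda" if torch.cuda.is_available() else "cpu"
    depth_net = load_depth(device)
    mask_gen = load_sam(device)
    img_rgb = np.asarray(Image.open("scene.jpg").convert("RGB"))

    # "goal1_sam" returns a 4-tuple (two preprocessed images) ...
    depth, masks, depth_img, sam_img = predict_depth_and_masks(
        depth_net, mask_gen, img_rgb, device, approach="goal1_sam"
    )
    # ... while "aligned" returns a 3-tuple (one shared image).
    depth_a, masks_a, img_processed = predict_depth_and_masks(
        depth_net, mask_gen, img_rgb, device, approach="aligned"
    )
    print(depth.shape, len(masks), depth_a.shape, len(masks_a))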