# seat-depth-analyser / seat_depth_analysis.py
import cv2
import numpy as np
import torch
from segment_anything import sam_model_registry, SamPredictor
from ultralytics import YOLO
import mediapipe as mp
import json
from datetime import datetime
import time
import os
def process_seat_depth_analysis(image_path, eye_to_ear_cm=7.0, sam_checkpoint="sam_vit_b_01ec64.pth"):
"""
Main function to process seat depth analysis
Args:
image_path: Path to the input image
eye_to_ear_cm: Real-world eye to ear distance for scaling (default 7.0 cm)
sam_checkpoint: Path to SAM model checkpoint
Returns:
tuple: (output_json, pose_image, seat_band_image, final_image)
"""
start_time = time.time()
def put_text_safe(image, text, org, font, font_scale, color, thickness):
text_size, _ = cv2.getTextSize(text, font, font_scale, thickness)
text_width, text_height = text_size
x, y = org
h, w = image.shape[:2]
# Adjust X if text goes out on the right
if x + text_width > w:
x = w - text_width - 5 # 5 pixel padding from right
# Adjust X if text goes out on the left
if x < 0:
x = 5 # 5 pixel padding from left
# Adjust Y if text goes above image
if y - text_height < 0:
y = text_height + 5 # push down
# Adjust Y if text goes below image
if y > h:
y = h - 5
cv2.putText(image, text, (x, y), font, font_scale, color, thickness)
# === Load image ===
image_bgr = cv2.imread(image_path)
if image_bgr is None:
raise ValueError(f"Could not load image from {image_path}")
image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
h, w = image_rgb.shape[:2]
    # === Run MediaPipe Pose Detection ===
    mp_pose = mp.solutions.pose
    pose = mp_pose.Pose(static_image_mode=True)
    results = pose.process(image_rgb)
    pose.close()  # release the MediaPipe graph; the results object stays valid
if not results.pose_landmarks:
raise ValueError("No pose detected in the image")
landmarks = results.pose_landmarks.landmark
    # === Get knee, eye, ear, and hip landmarks ===
    left_knee = landmarks[25]
    right_knee = landmarks[26]
    left_eye = landmarks[2]
    right_eye = landmarks[5]
    left_ear = landmarks[7]
    right_ear = landmarks[8]
    left_hip = landmarks[23]
    right_hip = landmarks[24]
# Convert to pixel coordinates
left_knee_px = (int(left_knee.x * w), int(left_knee.y * h))
right_knee_px = (int(right_knee.x * w), int(right_knee.y * h))
left_eye_px = (int(left_eye.x * w), int(left_eye.y * h))
right_eye_px = (int(right_eye.x * w), int(right_eye.y * h))
left_ear_px = (int(left_ear.x * w), int(left_ear.y * h))
right_ear_px = (int(right_ear.x * w), int(right_ear.y * h))
left_hip_px = (int(left_hip.x * w), int(left_hip.y * h))
right_hip_px = (int(right_hip.x * w), int(right_hip.y * h))
# === Determine Facing Direction ===
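    # Heuristic: a seated person's knees project forward of the face, so the
    # side on which the mean knee x lies relative to the mean eye x tells us
    # which way the person is facing in the image.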
avg_knee_x = (left_knee_px[0] + right_knee_px[0]) / 2
avg_eye_x = (left_eye_px[0] + right_eye_px[0]) / 2
facing_direction = "right" if avg_knee_x > avg_eye_x else "left"
# === Create Pose Overlay (Image 1) ===
pose_image = image_rgb.copy()
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_drawing.draw_landmarks(
pose_image,
results.pose_landmarks,
mp_pose.POSE_CONNECTIONS,
landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style()
)
    # === Step 1: Detect Chair with YOLOv8 ===
    yolo_model = YOLO("yolov8n.pt")  # ultralytics downloads the weights on first use
    yolo_results = yolo_model(image_rgb)
    # === Step 2: Get Chair Box ===
    chair_box = None
    chair_confidence = 0.0
    for result in yolo_results:
        for box, cls, conf in zip(result.boxes.xyxy, result.boxes.cls, result.boxes.conf):
            if int(cls.item()) == 56:  # COCO class 56 = chair
                chair_box = box.cpu().numpy().astype(int)
                chair_confidence = float(conf.item())
                break
        if chair_box is not None:
            break  # the inner break alone would not exit both loops
if chair_box is None:
raise ValueError("No chair detected in the image")
    x1, y1, x2, y2 = chair_box
    chair_height = y2 - y1
    # Trim the top quarter of the chair box (mostly backrest) so the SAM
    # prompt focuses on the seat pan.
    adjusted_y1 = y1 + int(0.25 * chair_height)
    input_box = np.array([x1, adjusted_y1, x2, y2])
# === Step 3: Load SAM ===
model_type = "vit_b"
device = "cuda" if torch.cuda.is_available() else "cpu"
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam.to(device=device)
predictor = SamPredictor(sam)
# === Step 4: Predict Mask from Bounding Box ===
predictor.set_image(image_rgb)
masks, scores, _ = predictor.predict(box=input_box[None, :], multimask_output=True)
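    # multimask_output=True yields three candidate masks; keep the one SAM
    # scores highest.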
best_mask = masks[np.argmax(scores)]
# === Step 5: Largest Component Only ===
    def get_largest_connected_component(mask):
        # Keep only the largest 8-connected blob; SAM masks can include small
        # disconnected fragments. Label 0 is the background, so the area
        # lookup skips stats[0].
        num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(mask.astype(np.uint8), connectivity=8)
        if num_labels <= 1:
            return mask
        largest_label = 1 + np.argmax(stats[1:, cv2.CC_STAT_AREA])
        return labels == largest_label
cleaned_mask = get_largest_connected_component(best_mask)
# === Step 6: Estimate Seat Front ===
knee_y = int((left_knee_px[1] + right_knee_px[1]) / 2)
band_thickness = chair_height // 2
y_min = max(0, knee_y - band_thickness)
y_max = min(h, knee_y + band_thickness)
band = cleaned_mask[y_min:y_max, :]
chair_pixels_x = np.where(band)[1]
if chair_pixels_x.size == 0:
raise ValueError("No chair pixels detected at knee level")
seat_front_x = chair_pixels_x.max() if facing_direction == "right" else chair_pixels_x.min()
seat_front_y = knee_y
# === Create Seat Front Band Visualization (Image 2) ===
seat_band_image = image_rgb.copy()
cv2.line(seat_band_image, (0, y_min), (w, y_min), (0, 255, 0), 2)
cv2.line(seat_band_image, (0, y_max), (w, y_max), (0, 255, 0), 2)
cv2.circle(seat_band_image, (seat_front_x, seat_front_y), 8, (0, 0, 255), -1)
put_text_safe(seat_band_image, "Seat Front", (seat_front_x + 10, seat_front_y - 10),
cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
# === Calculate Back of Knee Position ===
def euclidean_distance(p1, p2):
return np.linalg.norm(np.array(p1) - np.array(p2))
# Calculate thigh length for proportional offset
if facing_direction == "right":
hip_pt = right_hip_px
knee_pt_original = right_knee_px
else:
hip_pt = left_hip_px
knee_pt_original = left_knee_px
thigh_length_px = euclidean_distance(hip_pt, knee_pt_original)
# Back of knee is typically 12-15% of thigh length behind knee center
back_of_knee_offset = thigh_length_px * 0.13 # 13% of thigh length
# Apply offset in the backward direction
if facing_direction == "right":
knee_pt = (int(knee_pt_original[0] - back_of_knee_offset), knee_pt_original[1])
else:
knee_pt = (int(knee_pt_original[0] + back_of_knee_offset), knee_pt_original[1])
# === Calculate Measurements ===
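    # Knee clearance is the horizontal pixel gap between the seat front and
    # the estimated back of the knee.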
clearance_px = abs(seat_front_x - knee_pt[0])
# Check visibility and calculate eye-to-ear distance
visibility_warnings = []
if facing_direction == "right" and (right_eye.visibility < 0.5 or right_ear.visibility < 0.5):
visibility_warnings.append("Right eye or ear not clearly visible. Scaling may be inaccurate.")
elif facing_direction == "left" and (left_eye.visibility < 0.5 or left_ear.visibility < 0.5):
visibility_warnings.append("Left eye or ear not clearly visible. Scaling may be inaccurate.")
if facing_direction == "right":
eye_coord = right_eye_px
ear_coord = right_ear_px
else:
eye_coord = left_eye_px
ear_coord = left_ear_px
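    # The eye-to-ear span is roughly constant across adults (~7 cm by
    # default), so it serves as an in-image ruler for pixel-to-cm scaling.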
eye_to_ear_px = euclidean_distance(eye_coord, ear_coord)
pixels_per_cm = eye_to_ear_px / eye_to_ear_cm
clearance_cm = clearance_px / pixels_per_cm
# Determine if back of knee is behind seat front
if facing_direction == "right":
knee_behind_seat = knee_pt[0] < seat_front_x
else:
knee_behind_seat = knee_pt[0] > seat_front_x
    # === Classification ===
    if clearance_cm < 2:
        category = "Too Deep"
        reasoning = f"Clearance of {clearance_cm:.2f}cm is less than the 2cm minimum"
    elif knee_behind_seat:
        category = "Too Deep"
        reasoning = "Back of knee is behind seat front"
    elif clearance_cm <= 6:
        category = "Optimal"
        reasoning = f"Clearance of {clearance_cm:.2f}cm falls within the optimal range (2-6cm)"
    else:
        category = "Too Short"
        reasoning = f"Clearance of {clearance_cm:.2f}cm exceeds the 6cm optimal maximum"
    # === Create Final Visualization (Image 3) ===
    # Drawing happens on the RGB copy, so the color tuples below are (R, G, B).
    final_image = image_rgb.copy()
    # Mark the seat front (blue) and the estimated back of knee (red)
    cv2.circle(final_image, (seat_front_x, seat_front_y), 8, (0, 0, 255), -1)
    cv2.circle(final_image, knee_pt, 8, (255, 0, 0), -1)
# Height at which the line floats
line_y = min(seat_front_y, knee_pt[1]) - 30
# Draw horizontal line (floating)
cv2.line(final_image, (min(seat_front_x, knee_pt[0]), line_y),
(max(seat_front_x, knee_pt[0]), line_y),
(255, 255, 0), 2)
# Add arrow tips
cv2.arrowedLine(final_image,
(min(seat_front_x, knee_pt[0]) + 20, line_y),
(min(seat_front_x, knee_pt[0]), line_y),
(255, 255, 0), 2, tipLength=0.4)
cv2.arrowedLine(final_image,
(max(seat_front_x, knee_pt[0]) - 20, line_y),
(max(seat_front_x, knee_pt[0]), line_y),
(255, 255, 0), 2, tipLength=0.4)
# Put clearance text above the line
put_text_safe(final_image, f"Knee clearance: {clearance_cm:.1f} cm",
(min(seat_front_x, knee_pt[0]) + 10, line_y - 10),
cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)
# Draw eye-to-ear line
cv2.line(final_image, eye_coord, ear_coord, (0, 255, 0), 2)
put_text_safe(final_image, f"{eye_to_ear_cm:.1f}cm",
(eye_coord[0], eye_coord[1] - 10),
cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
# === Generate JSON Output ===
processing_time = int((time.time() - start_time) * 1000)
output_json = {
"frame_id": os.path.basename(image_path),
"timestamp": datetime.now().isoformat(),
"pose_detection": {
"pose_detected": True,
"facing_direction": facing_direction,
"landmarks_visibility": {
"left_eye": float(left_eye.visibility),
"right_eye": float(right_eye.visibility),
"left_ear": float(left_ear.visibility),
"right_ear": float(right_ear.visibility),
"left_knee": float(left_knee.visibility),
"right_knee": float(right_knee.visibility),
"left_hip": float(left_hip.visibility),
"right_hip": float(right_hip.visibility)
}
},
"chair_detection": {
"chair_detected": True,
"chair_bbox": chair_box.tolist(),
"chair_confidence": chair_confidence
},
"measurements": {
"eye_to_ear_distance_px": float(eye_to_ear_px),
"eye_to_ear_distance_cm": float(eye_to_ear_cm),
"pixels_per_cm": float(pixels_per_cm),
"seat_front_position": [int(seat_front_x), int(seat_front_y)],
"back_of_knee_position": [int(knee_pt[0]), int(knee_pt[1])],
"knee_clearance_px": float(clearance_px),
"knee_clearance_cm": float(clearance_cm),
"thigh_length_px": float(thigh_length_px),
"back_of_knee_offset_applied": float(back_of_knee_offset)
},
"classification": {
"category": category,
"knee_behind_seat": bool(knee_behind_seat),
"reasoning": reasoning
},
"debug_info": {
"band_y_range": [int(y_min), int(y_max)],
"chair_pixels_detected": int(chair_pixels_x.size),
"segmentation_success": True,
"scaling_method": "eye_to_ear_reference"
},
"warnings": visibility_warnings,
"processing_time_ms": processing_time
}
return output_json, pose_image, seat_band_image, final_image
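

# Example usage: a minimal sketch, assuming the SAM checkpoint sits next to
# this script (the input image filename below is hypothetical).
if __name__ == "__main__":
    output_json, pose_img, band_img, final_img = process_seat_depth_analysis(
        "seated_person.jpg",  # hypothetical input image
        eye_to_ear_cm=7.0,
        sam_checkpoint="sam_vit_b_01ec64.pth",
    )
    print(json.dumps(output_json, indent=2))
    # The returned images are RGB; convert back to BGR before saving with OpenCV.
    cv2.imwrite("seat_depth_result.jpg", cv2.cvtColor(final_img, cv2.COLOR_RGB2BGR))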