import cv2
import numpy as np
import torch
from segment_anything import sam_model_registry, SamPredictor
from ultralytics import YOLO
import mediapipe as mp
import json
from datetime import datetime
import time
import os


def process_seat_depth_analysis(image_path, eye_to_ear_cm=7.0, sam_checkpoint="sam_vit_b_01ec64.pth"):
    """
    Main function to process seat depth analysis

    Args:
        image_path: Path to the input image
        eye_to_ear_cm: Real-world eye to ear distance for scaling (default 7.0 cm)
        sam_checkpoint: Path to SAM model checkpoint

    Returns:
        tuple: (output_json, pose_image, seat_band_image, final_image)
    """
    start_time = time.time()

    def put_text_safe(image, text, org, font, font_scale, color, thickness):
        text_size, _ = cv2.getTextSize(text, font, font_scale, thickness)
        text_width, text_height = text_size
        x, y = org
        h, w = image.shape[:2]
        # Adjust X if text goes out on the right
        if x + text_width > w:
            x = w - text_width - 5  # 5 pixel padding from right
        # Adjust X if text goes out on the left
        if x < 0:
            x = 5  # 5 pixel padding from left
        # Adjust Y if text goes above image
        if y - text_height < 0:
            y = text_height + 5  # push down
        # Adjust Y if text goes below image
        if y > h:
            y = h - 5
        cv2.putText(image, text, (x, y), font, font_scale, color, thickness)

    # === Load image ===
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        raise ValueError(f"Could not load image from {image_path}")
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    h, w = image_rgb.shape[:2]

    # === Run MediaPipe Pose Detection ===
    mp_pose = mp.solutions.pose
    pose = mp_pose.Pose(static_image_mode=True)
    results = pose.process(image_rgb)
    if not results.pose_landmarks:
        raise ValueError("No pose detected in the image")
    landmarks = results.pose_landmarks.landmark

    # === Get Knee and Eye X,Y coordinates ===
    left_knee = landmarks[25]
    right_knee = landmarks[26]
    left_eye = landmarks[2]
    right_eye = landmarks[5]
    right_ear = landmarks[8]
    left_ear = landmarks[7]
    left_hip = landmarks[23]
    right_hip = landmarks[24]

    # Convert to pixel coordinates
    left_knee_px = (int(left_knee.x * w), int(left_knee.y * h))
    right_knee_px = (int(right_knee.x * w), int(right_knee.y * h))
    left_eye_px = (int(left_eye.x * w), int(left_eye.y * h))
    right_eye_px = (int(right_eye.x * w), int(right_eye.y * h))
    left_ear_px = (int(left_ear.x * w), int(left_ear.y * h))
    right_ear_px = (int(right_ear.x * w), int(right_ear.y * h))
    left_hip_px = (int(left_hip.x * w), int(left_hip.y * h))
    right_hip_px = (int(right_hip.x * w), int(right_hip.y * h))

    # === Determine Facing Direction ===
    avg_knee_x = (left_knee_px[0] + right_knee_px[0]) / 2
    avg_eye_x = (left_eye_px[0] + right_eye_px[0]) / 2
    facing_direction = "right" if avg_knee_x > avg_eye_x else "left"

    # === Create Pose Overlay (Image 1) ===
    pose_image = image_rgb.copy()
    mp_drawing = mp.solutions.drawing_utils
    mp_drawing_styles = mp.solutions.drawing_styles
    mp_drawing.draw_landmarks(
        pose_image,
        results.pose_landmarks,
        mp_pose.POSE_CONNECTIONS,
        landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style()
    )

    # === Step 1: Detect Chair with YOLOv8 ===
    yolo_model = YOLO("yolov8n.pt")
    yolo_results = yolo_model(image_rgb)

    # === Step 2: Get Chair Box ===
    chair_box = None
    chair_confidence = 0.0
    for result in yolo_results:
        for box, cls, conf in zip(result.boxes.xyxy, result.boxes.cls, result.boxes.conf):
            if int(cls.item()) == 56:  # 56 = chair (COCO class id)
                chair_box = box.cpu().numpy().astype(int)
                chair_confidence = float(conf.item())
                break
        if chair_box is not None:
            break  # stop at the first detected chair; the inner break alone would not exit this loop
    if chair_box is None:
        raise ValueError("No chair detected in the image")
    x1, y1, x2, y2 = chair_box
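    # Heuristic (assumption): the top quarter of the chair box typically covers the
    # backrest, so it is trimmed off before prompting SAM to bias the predicted
    # mask toward the seat surface rather than the chair back.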
    chair_height = y2 - y1
    adjusted_y1 = y1 + int(0.25 * chair_height)
    input_box = np.array([x1, adjusted_y1, x2, y2])

    # === Step 3: Load SAM ===
    model_type = "vit_b"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
    sam.to(device=device)
    predictor = SamPredictor(sam)

    # === Step 4: Predict Mask from Bounding Box ===
    predictor.set_image(image_rgb)
    masks, scores, _ = predictor.predict(box=input_box[None, :], multimask_output=True)
    best_mask = masks[np.argmax(scores)]

    # === Step 5: Largest Component Only ===
    def get_largest_connected_component(mask):
        num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
            mask.astype(np.uint8), connectivity=8
        )
        if num_labels <= 1:
            return mask
        largest_label = 1 + np.argmax(stats[1:, cv2.CC_STAT_AREA])
        return labels == largest_label

    cleaned_mask = get_largest_connected_component(best_mask)

    # === Step 6: Estimate Seat Front ===
    knee_y = int((left_knee_px[1] + right_knee_px[1]) / 2)
    band_thickness = chair_height // 2
    y_min = max(0, knee_y - band_thickness)
    y_max = min(h, knee_y + band_thickness)
    band = cleaned_mask[y_min:y_max, :]
    chair_pixels_x = np.where(band)[1]
    if chair_pixels_x.size == 0:
        raise ValueError("No chair pixels detected at knee level")
    seat_front_x = chair_pixels_x.max() if facing_direction == "right" else chair_pixels_x.min()
    seat_front_y = knee_y

    # === Create Seat Front Band Visualization (Image 2) ===
    seat_band_image = image_rgb.copy()
    cv2.line(seat_band_image, (0, y_min), (w, y_min), (0, 255, 0), 2)
    cv2.line(seat_band_image, (0, y_max), (w, y_max), (0, 255, 0), 2)
    cv2.circle(seat_band_image, (seat_front_x, seat_front_y), 8, (0, 0, 255), -1)
    put_text_safe(seat_band_image, "Seat Front", (seat_front_x + 10, seat_front_y - 10),
                  cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

    # === Calculate Back of Knee Position ===
    def euclidean_distance(p1, p2):
        return np.linalg.norm(np.array(p1) - np.array(p2))

    # Calculate thigh length for proportional offset
    if facing_direction == "right":
        hip_pt = right_hip_px
        knee_pt_original = right_knee_px
    else:
        hip_pt = left_hip_px
        knee_pt_original = left_knee_px
    thigh_length_px = euclidean_distance(hip_pt, knee_pt_original)

    # Back of knee is typically 12-15% of thigh length behind knee center
    back_of_knee_offset = thigh_length_px * 0.13  # 13% of thigh length

    # Apply offset in the backward direction
    if facing_direction == "right":
        knee_pt = (int(knee_pt_original[0] - back_of_knee_offset), knee_pt_original[1])
    else:
        knee_pt = (int(knee_pt_original[0] + back_of_knee_offset), knee_pt_original[1])

    # === Calculate Measurements ===
    clearance_px = abs(seat_front_x - knee_pt[0])

    # Check visibility and calculate eye-to-ear distance
    visibility_warnings = []
    if facing_direction == "right" and (right_eye.visibility < 0.5 or right_ear.visibility < 0.5):
        visibility_warnings.append("Right eye or ear not clearly visible. Scaling may be inaccurate.")
    elif facing_direction == "left" and (left_eye.visibility < 0.5 or left_ear.visibility < 0.5):
        visibility_warnings.append("Left eye or ear not clearly visible. Scaling may be inaccurate.")
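    # Scaling note (assumption): using the eye-to-ear segment as a ruler is only
    # accurate when the head is roughly at the same camera depth as the knee and
    # seen in profile; perspective or head rotation will bias pixels_per_cm.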
    if facing_direction == "right":
        eye_coord = right_eye_px
        ear_coord = right_ear_px
    else:
        eye_coord = left_eye_px
        ear_coord = left_ear_px
    eye_to_ear_px = euclidean_distance(eye_coord, ear_coord)
    pixels_per_cm = eye_to_ear_px / eye_to_ear_cm
    clearance_cm = clearance_px / pixels_per_cm

    # Determine if back of knee is behind seat front
    if facing_direction == "right":
        knee_behind_seat = knee_pt[0] < seat_front_x
    else:
        knee_behind_seat = knee_pt[0] > seat_front_x

    # === Classification ===
    if clearance_cm < 2:
        category = "Too Deep"
        reasoning = f"Clearance of {clearance_cm:.2f}cm is less than 2cm minimum"
    elif knee_behind_seat:
        category = "Too Deep"
        reasoning = "Back of knee is behind seat front"
    elif clearance_cm <= 6:
        category = "Optimal"
        reasoning = f"Clearance of {clearance_cm:.2f}cm falls within optimal range (2-6cm)"
    else:
        category = "Too Short"
        reasoning = f"Clearance of {clearance_cm:.2f}cm exceeds 6cm optimal maximum"

    # === Create Final Visualization (Image 3) ===
    final_image = image_rgb.copy()

    # Draw seat front and knee
    cv2.circle(final_image, (seat_front_x, seat_front_y), 8, (0, 0, 255), -1)
    cv2.circle(final_image, knee_pt, 8, (255, 0, 0), -1)

    # Height at which the line floats
    line_y = min(seat_front_y, knee_pt[1]) - 30

    # Draw horizontal line (floating)
    cv2.line(final_image, (min(seat_front_x, knee_pt[0]), line_y),
             (max(seat_front_x, knee_pt[0]), line_y), (255, 255, 0), 2)

    # Add arrow tips
    cv2.arrowedLine(final_image, (min(seat_front_x, knee_pt[0]) + 20, line_y),
                    (min(seat_front_x, knee_pt[0]), line_y), (255, 255, 0), 2, tipLength=0.4)
    cv2.arrowedLine(final_image, (max(seat_front_x, knee_pt[0]) - 20, line_y),
                    (max(seat_front_x, knee_pt[0]), line_y), (255, 255, 0), 2, tipLength=0.4)

    # Put clearance text above the line
    put_text_safe(final_image, f"Knee clearance: {clearance_cm:.1f} cm",
                  (min(seat_front_x, knee_pt[0]) + 10, line_y - 10),
                  cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)

    # Draw eye-to-ear line
    cv2.line(final_image, eye_coord, ear_coord, (0, 255, 0), 2)
    put_text_safe(final_image, f"{eye_to_ear_cm:.1f}cm", (eye_coord[0], eye_coord[1] - 10),
                  cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

    # === Generate JSON Output ===
    processing_time = int((time.time() - start_time) * 1000)
    output_json = {
        "frame_id": os.path.basename(image_path),
        "timestamp": datetime.now().isoformat(),
        "pose_detection": {
            "pose_detected": True,
            "facing_direction": facing_direction,
            "landmarks_visibility": {
                "left_eye": float(left_eye.visibility),
                "right_eye": float(right_eye.visibility),
                "left_ear": float(left_ear.visibility),
                "right_ear": float(right_ear.visibility),
                "left_knee": float(left_knee.visibility),
                "right_knee": float(right_knee.visibility),
                "left_hip": float(left_hip.visibility),
                "right_hip": float(right_hip.visibility)
            }
        },
        "chair_detection": {
            "chair_detected": True,
            "chair_bbox": chair_box.tolist(),
            "chair_confidence": chair_confidence
        },
        "measurements": {
            "eye_to_ear_distance_px": float(eye_to_ear_px),
            "eye_to_ear_distance_cm": float(eye_to_ear_cm),
            "pixels_per_cm": float(pixels_per_cm),
            "seat_front_position": [int(seat_front_x), int(seat_front_y)],
            "back_of_knee_position": [int(knee_pt[0]), int(knee_pt[1])],
            "knee_clearance_px": float(clearance_px),
            "knee_clearance_cm": float(clearance_cm),
            "thigh_length_px": float(thigh_length_px),
            "back_of_knee_offset_applied": float(back_of_knee_offset)
        },
        "classification": {
            "category": category,
            "knee_behind_seat": bool(knee_behind_seat),
            "reasoning": reasoning
        },
"debug_info": { "band_y_range": [int(y_min), int(y_max)], "chair_pixels_detected": int(chair_pixels_x.size), "segmentation_success": True, "scaling_method": "eye_to_ear_reference" }, "warnings": visibility_warnings, "processing_time_ms": processing_time } return output_json, pose_image, seat_band_image, final_image