Spaces:

nitikaborkar
/

seat-depth-analyser

Sleeping

File size: 12,465 Bytes

a6beb58

import cv2
import numpy as np
import torch
from segment_anything import sam_model_registry, SamPredictor
from ultralytics import YOLO
import mediapipe as mp
import json
from datetime import datetime
import time
import os

# COCO class id that the YOLO detections are filtered on (56 = chair).
_COCO_CHAIR_CLASS_ID = 56
# Landmarks with visibility below this value trigger a scaling warning.
_MIN_VISIBILITY = 0.5
# Back of knee is placed this fraction of the thigh length behind the knee landmark.
_BACK_OF_KNEE_RATIO = 0.13
# Padding (in pixels) kept between relocated text and the image border.
_TEXT_MARGIN_PX = 5


def _put_text_safe(image, text, org, font, font_scale, color, thickness):
    """Draw ``text`` on ``image`` in place, shifting the origin so it stays in frame."""
    (text_width, text_height), _ = cv2.getTextSize(text, font, font_scale, thickness)
    x, y = org
    h, w = image.shape[:2]

    # Pull the text back in from the right edge, then from the left edge.
    if x + text_width > w:
        x = w - text_width - _TEXT_MARGIN_PX
    if x < 0:
        x = _TEXT_MARGIN_PX

    # Push the baseline down if the glyphs would start above the image,
    # and back up if the baseline lies below the image.
    if y - text_height < 0:
        y = text_height + _TEXT_MARGIN_PX
    if y > h:
        y = h - _TEXT_MARGIN_PX

    cv2.putText(image, text, (int(x), int(y)), font, font_scale, color, thickness)


def _largest_connected_component(mask):
    """Return a boolean mask holding only the largest 8-connected component of ``mask``."""
    num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
        mask.astype(np.uint8), connectivity=8
    )
    # Label 0 is the background; with no foreground component, return the input as is.
    if num_labels <= 1:
        return mask
    largest_label = 1 + np.argmax(stats[1:, cv2.CC_STAT_AREA])
    return labels == largest_label


def _euclidean_distance(p1, p2):
    """Return the straight-line distance between two points."""
    return np.linalg.norm(np.array(p1) - np.array(p2))


def _landmark_to_pixel(landmark, width, height):
    """Convert a normalized landmark to integer ``(x, y)`` pixel coordinates."""
    return (int(landmark.x * width), int(landmark.y * height))


def _classify_seat_depth(clearance_cm, knee_behind_seat):
    """Return ``(category, reasoning)`` for a knee clearance in centimetres."""
    if clearance_cm < 2:
        return "Too Deep", f"Clearance of {clearance_cm:.2f}cm is less than 2cm minimum"
    if knee_behind_seat:
        return "Too Deep", "Back of knee is behind seat front"
    if clearance_cm <= 6:
        return "Optimal", f"Clearance of {clearance_cm:.2f}cm falls within optimal range (2-6cm)"
    return "Too Short", f"Clearance of {clearance_cm:.2f}cm exceeds 6cm optimal maximum"


def process_seat_depth_analysis(image_path, eye_to_ear_cm=7.0, sam_checkpoint="sam_vit_b_01ec64.pth"):
    """
    Main function to process seat depth analysis

    The pipeline detects the body pose (MediaPipe), detects a chair (YOLO),
    segments it (SAM), takes the front-most chair pixel in a band around knee
    height as the seat front, and measures the horizontal gap between that
    point and an estimated back-of-knee point. Pixels are converted to
    centimetres using the eye-to-ear pixel distance as the scale reference.

    NOTE: the YOLO and SAM models are constructed on every call.

    Args:
        image_path: Path to the input image
        eye_to_ear_cm: Real-world eye to ear distance for scaling (default 7.0 cm).
            Must be greater than zero.
        sam_checkpoint: Path to SAM model checkpoint

    Returns:
        tuple: (output_json, pose_image, seat_band_image, final_image)
            ``output_json`` is a dict with the measurements, classification,
            warnings and timing; the three images are annotated copies of the
            RGB-converted input image.

    Raises:
        ValueError: If ``eye_to_ear_cm`` is not positive, the image cannot be
            loaded, no pose or no chair is detected, no chair pixels are found
            at knee level, or the eye and ear landmarks fall on the same pixel
            (no pixel scale can be derived).
    """
    # Validate the scale reference before doing any expensive work; a zero or
    # negative value would otherwise yield an infinite/negative pixel scale.
    if not eye_to_ear_cm > 0:
        raise ValueError(f"eye_to_ear_cm must be a positive number, got {eye_to_ear_cm!r}")

    start_time = time.time()

    # === Load image ===
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        raise ValueError(f"Could not load image from {image_path}")

    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    h, w = image_rgb.shape[:2]

    # === Run MediaPipe Pose Detection ===
    mp_pose = mp.solutions.pose
    # The context manager closes the Pose object once the image is processed.
    with mp_pose.Pose(static_image_mode=True) as pose:
        results = pose.process(image_rgb)

    if not results.pose_landmarks:
        raise ValueError("No pose detected in the image")

    landmarks = results.pose_landmarks.landmark

    # === Get Knee and Eye X,Y coordinates ===
    left_knee = landmarks[25]
    right_knee = landmarks[26]
    left_eye = landmarks[2]
    right_eye = landmarks[5]
    right_ear = landmarks[8]
    left_ear = landmarks[7]
    left_hip = landmarks[23]
    right_hip = landmarks[24]

    # Convert to pixel coordinates
    left_knee_px = _landmark_to_pixel(left_knee, w, h)
    right_knee_px = _landmark_to_pixel(right_knee, w, h)
    left_eye_px = _landmark_to_pixel(left_eye, w, h)
    right_eye_px = _landmark_to_pixel(right_eye, w, h)
    left_ear_px = _landmark_to_pixel(left_ear, w, h)
    right_ear_px = _landmark_to_pixel(right_ear, w, h)
    left_hip_px = _landmark_to_pixel(left_hip, w, h)
    right_hip_px = _landmark_to_pixel(right_hip, w, h)

    # === Determine Facing Direction ===
    # Knees further right than the eyes means the person faces right.
    avg_knee_x = (left_knee_px[0] + right_knee_px[0]) / 2
    avg_eye_x = (left_eye_px[0] + right_eye_px[0]) / 2
    facing_direction = "right" if avg_knee_x > avg_eye_x else "left"
    facing_right = facing_direction == "right"

    # === Create Pose Overlay (Image 1) ===
    pose_image = image_rgb.copy()
    mp_drawing = mp.solutions.drawing_utils
    mp_drawing_styles = mp.solutions.drawing_styles

    mp_drawing.draw_landmarks(
        pose_image,
        results.pose_landmarks,
        mp_pose.POSE_CONNECTIONS,
        landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style()
    )

    # === Step 1: Detect Chair with YOLOv8 ===
    yolo_model = YOLO("yolov8n.pt")
    yolo_results = yolo_model(image_rgb)

    # === Step 2: Get Chair Box ===
    # Keep the chair detection with the highest confidence rather than relying
    # on the order in which detections happen to be returned.
    chair_box = None
    chair_confidence = 0.0
    for result in yolo_results:
        boxes = result.boxes
        for box, cls, conf in zip(boxes.xyxy, boxes.cls, boxes.conf):
            if int(cls.item()) != _COCO_CHAIR_CLASS_ID:
                continue
            confidence = float(conf.item())
            if chair_box is None or confidence > chair_confidence:
                chair_box = box.cpu().numpy().astype(int)
                chair_confidence = confidence

    if chair_box is None:
        raise ValueError("No chair detected in the image")

    # Plain Python ints keep later arithmetic and OpenCV drawing calls free of
    # NumPy scalar types.
    x1, y1, x2, y2 = (int(v) for v in chair_box)
    chair_height = y2 - y1
    # The SAM prompt box skips the top quarter of the detected chair box.
    adjusted_y1 = y1 + int(0.25 * chair_height)
    input_box = np.array([x1, adjusted_y1, x2, y2])

    # === Step 3: Load SAM ===
    model_type = "vit_b"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
    sam.to(device=device)
    predictor = SamPredictor(sam)

    # === Step 4: Predict Mask from Bounding Box ===
    predictor.set_image(image_rgb)
    masks, scores, _ = predictor.predict(box=input_box[None, :], multimask_output=True)
    best_mask = masks[np.argmax(scores)]

    # === Step 5: Largest Component Only ===
    cleaned_mask = _largest_connected_component(best_mask)

    # === Step 6: Estimate Seat Front ===
    # Search a horizontal band centred on the average knee height, half a
    # chair-height above and below, clipped to the image.
    knee_y = int((left_knee_px[1] + right_knee_px[1]) / 2)
    band_thickness = chair_height // 2
    y_min = max(0, knee_y - band_thickness)
    y_max = min(h, knee_y + band_thickness)
    band = cleaned_mask[y_min:y_max, :]
    chair_pixels_x = np.where(band)[1]

    if chair_pixels_x.size == 0:
        raise ValueError("No chair pixels detected at knee level")

    # The seat front is the chair pixel furthest in the facing direction.
    seat_front_x = int(chair_pixels_x.max() if facing_right else chair_pixels_x.min())
    seat_front_y = knee_y

    # === Create Seat Front Band Visualization (Image 2) ===
    seat_band_image = image_rgb.copy()
    cv2.line(seat_band_image, (0, y_min), (w, y_min), (0, 255, 0), 2)
    cv2.line(seat_band_image, (0, y_max), (w, y_max), (0, 255, 0), 2)
    cv2.circle(seat_band_image, (seat_front_x, seat_front_y), 8, (0, 0, 255), -1)
    _put_text_safe(seat_band_image, "Seat Front", (seat_front_x + 10, seat_front_y - 10),
                   cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

    # === Calculate Back of Knee Position ===
    # Use the hip/knee/eye/ear landmarks of the side matching the facing direction.
    if facing_right:
        hip_pt = right_hip_px
        knee_pt_original = right_knee_px
        eye_coord = right_eye_px
        ear_coord = right_ear_px
    else:
        hip_pt = left_hip_px
        knee_pt_original = left_knee_px
        eye_coord = left_eye_px
        ear_coord = left_ear_px

    # Calculate thigh length for proportional offset
    thigh_length_px = _euclidean_distance(hip_pt, knee_pt_original)

    # Back of knee is typically 12-15% of thigh length behind knee center
    back_of_knee_offset = thigh_length_px * _BACK_OF_KNEE_RATIO

    # Apply offset in the backward direction (opposite to the facing direction)
    backward_sign = -1 if facing_right else 1
    knee_pt = (
        int(knee_pt_original[0] + backward_sign * back_of_knee_offset),
        knee_pt_original[1],
    )

    # === Calculate Measurements ===
    clearance_px = abs(seat_front_x - knee_pt[0])

    # Check visibility of the landmarks used for scaling
    visibility_warnings = []
    if facing_right and (right_eye.visibility < _MIN_VISIBILITY or right_ear.visibility < _MIN_VISIBILITY):
        visibility_warnings.append("Right eye or ear not clearly visible. Scaling may be inaccurate.")
    elif not facing_right and (left_eye.visibility < _MIN_VISIBILITY or left_ear.visibility < _MIN_VISIBILITY):
        visibility_warnings.append("Left eye or ear not clearly visible. Scaling may be inaccurate.")

    eye_to_ear_px = _euclidean_distance(eye_coord, ear_coord)
    # Coincident eye/ear pixels would make the scale zero and the clearance
    # infinite or NaN; fail explicitly instead of returning a bogus result.
    if not eye_to_ear_px > 0:
        raise ValueError("Eye and ear landmarks coincide; cannot derive a pixel scale")

    pixels_per_cm = eye_to_ear_px / eye_to_ear_cm
    clearance_cm = clearance_px / pixels_per_cm

    # Determine if back of knee is behind seat front
    if facing_right:
        knee_behind_seat = knee_pt[0] < seat_front_x
    else:
        knee_behind_seat = knee_pt[0] > seat_front_x

    # === Classification ===
    category, reasoning = _classify_seat_depth(clearance_cm, knee_behind_seat)

    # === Create Final Visualization (Image 3) ===
    final_image = image_rgb.copy()

    # Draw seat front and knee
    cv2.circle(final_image, (seat_front_x, seat_front_y), 8, (0, 0, 255), -1)
    cv2.circle(final_image, knee_pt, 8, (255, 0, 0), -1)

    # Height at which the line floats
    line_y = min(seat_front_y, knee_pt[1]) - 30
    span_left = min(seat_front_x, knee_pt[0])
    span_right = max(seat_front_x, knee_pt[0])

    # Draw horizontal line (floating)
    cv2.line(final_image, (span_left, line_y), (span_right, line_y), (255, 255, 0), 2)

    # Add arrow tips pointing outwards at both ends of the line
    cv2.arrowedLine(final_image,
                    (span_left + 20, line_y),
                    (span_left, line_y),
                    (255, 255, 0), 2, tipLength=0.4)

    cv2.arrowedLine(final_image,
                    (span_right - 20, line_y),
                    (span_right, line_y),
                    (255, 255, 0), 2, tipLength=0.4)

    # Put clearance text above the line
    _put_text_safe(final_image, f"Knee clearance: {clearance_cm:.1f} cm",
                   (span_left + 10, line_y - 10),
                   cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)

    # Draw eye-to-ear line
    cv2.line(final_image, eye_coord, ear_coord, (0, 255, 0), 2)
    _put_text_safe(final_image, f"{eye_to_ear_cm:.1f}cm",
                   (eye_coord[0], eye_coord[1] - 10),
                   cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

    # === Generate JSON Output ===
    processing_time = int((time.time() - start_time) * 1000)

    output_json = {
        "frame_id": os.path.basename(image_path),
        "timestamp": datetime.now().isoformat(),
        "pose_detection": {
            "pose_detected": True,
            "facing_direction": facing_direction,
            "landmarks_visibility": {
                "left_eye": float(left_eye.visibility),
                "right_eye": float(right_eye.visibility),
                "left_ear": float(left_ear.visibility),
                "right_ear": float(right_ear.visibility),
                "left_knee": float(left_knee.visibility),
                "right_knee": float(right_knee.visibility),
                "left_hip": float(left_hip.visibility),
                "right_hip": float(right_hip.visibility)
            }
        },
        "chair_detection": {
            "chair_detected": True,
            "chair_bbox": chair_box.tolist(),
            "chair_confidence": chair_confidence
        },
        "measurements": {
            "eye_to_ear_distance_px": float(eye_to_ear_px),
            "eye_to_ear_distance_cm": float(eye_to_ear_cm),
            "pixels_per_cm": float(pixels_per_cm),
            "seat_front_position": [int(seat_front_x), int(seat_front_y)],
            "back_of_knee_position": [int(knee_pt[0]), int(knee_pt[1])],
            "knee_clearance_px": float(clearance_px),
            "knee_clearance_cm": float(clearance_cm),
            "thigh_length_px": float(thigh_length_px),
            "back_of_knee_offset_applied": float(back_of_knee_offset)
        },
        "classification": {
            "category": category,
            "knee_behind_seat": bool(knee_behind_seat),
            "reasoning": reasoning
        },
        "debug_info": {
            "band_y_range": [int(y_min), int(y_max)],
            "chair_pixels_detected": int(chair_pixels_x.size),
            "segmentation_success": True,
            "scaling_method": "eye_to_ear_reference"
        },
        "warnings": visibility_warnings,
        "processing_time_ms": processing_time
    }

    return output_json, pose_image, seat_band_image, final_image