File size: 3,008 Bytes
13aa528
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import cv2
import numpy as np
from torchvision.transforms.v2 import ToPILImage
from PIL import Image
from transformers import pipeline
import torch
from imgutils.detect import detect_heads
from src.utils.device import determine_accelerator
# Shared converter; the cropping function below uses it to turn NumPy image
# crops into PIL images before handing them to the detectors.
topil = ToPILImage()

# 1. Initialize the filtering pipeline
# NOTE: this is module-level code, so the model is loaded as a side effect of
# importing this module.
# determine_accelerator() is a project helper; presumably it returns a device
# value that transformers.pipeline accepts -- confirm in src.utils.device.
device = determine_accelerator()

print("Loading AI Model...")
# Image+text chat pipeline (Gemma 3 12B instruction-tuned, bfloat16 weights).
# It is queried once per candidate crop to confirm that the crop shows a head.
pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-12b-it",
    device=device,
    torch_dtype=torch.bfloat16,
)

# Question put to the vision-language model for every candidate crop.
_HEAD_CHECK_PROMPT = (
    "You are given a black-and-white line drawing as input. "
    "Please analyze the image carefully. "
    "If the drawing contains the majority of a head or face—meaning most key "
    "facial features or the overall shape of the head are visible—respond "
    "with 'True'. Otherwise, respond with 'False'. "
    "Do not contain any punctuation or extra spaces in your answer. "
    "Just respond with 'True' or 'False'"
)


def _is_negative_answer(answer):
    """Return True when the model's reply, after normalization, is 'false'."""
    if not isinstance(answer, str):
        return False
    # The model does not always obey the "no punctuation / no spaces" rule, so
    # tolerate surrounding whitespace, quotes, a trailing period and casing.
    return answer.strip().strip(".!'\"`").strip().lower() == "false"


def _vlm_rejects_crop(crop_pil):
    """Ask the vision-language pipeline whether *crop_pil* lacks a head/face."""
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": crop_pil},
                {"type": "text", "text": _HEAD_CHECK_PROMPT},
            ],
        },
    ]
    output = pipe(text=messages, max_new_tokens=200)
    # The pipeline returns the whole chat; the last message is the model reply.
    return _is_negative_answer(output[0]["generated_text"][-1]["content"])


def crop_and_mask_characters_gradio(pil_img):
    """
    Locate character regions that contain a head and return their bounding boxes.

    The image is thresholded so that everything darker than near-white counts as
    foreground, external contours are extracted, and each contour's bounding box
    is kept only if it is at least 256x256 pixels, ``detect_heads`` finds at
    least one head in it, and the vision-language model does not answer
    'False' to the head/face question.

    Nothing is written to disk and no masks are produced.

    Args:
        pil_img: The input image. A PIL image is normalized to RGB first; any
            other array-like is passed to ``np.array`` unchanged and is assumed
            to already be a 3-channel RGB array.

    Returns:
        list: ``(index, (x, y, w, h))`` tuples, where ``index`` starts at 1 and
        counts accepted regions in contour order, and ``(x, y, w, h)`` is the
        bounding box in pixel coordinates of the input image.
    """
    # Normalize palette / grayscale / RGBA inputs so the colour conversion
    # below always sees a 3-channel array.
    if isinstance(pil_img, Image.Image):
        pil_img = pil_img.convert("RGB")
    img = np.array(pil_img)

    # PIL arrays are RGB-ordered, so use the RGB conversion code; the BGR code
    # would swap the red and blue luma weights.
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Inverted threshold: anything that is not (almost) pure white becomes
    # foreground.
    _, thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV)

    # Outermost contours only; nested shapes belong to their parent region.
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    coord_info_list = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w < 256 or h < 256:  # Skip small contours
            continue

        # Convert once and reuse the same PIL crop for both checks.
        crop_pil = topil(img[y:y + h, x:x + w])

        # Cheap detector first; only query the large model when it finds a head.
        if not detect_heads(crop_pil):
            continue
        if _vlm_rejects_crop(crop_pil):
            continue

        # 1-based running index of accepted regions.
        coord_info_list.append((len(coord_info_list) + 1, (x, y, w, h)))
    return coord_info_list