from typing import List, Union
from PIL import Image, ImageDraw
import torch

from diffusers.modular_pipelines import (
    PipelineState,
    ModularPipelineBlocks,
    InputParam,
    ComponentSpec,
    OutputParam,
)
from transformers import AutoProcessor, AutoModelForCausalLM


class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
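    """Annotate input image(s) with Florence-2 and rasterize the returned
    segmentation polygons into binary mask(s)."""
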
    @property
    def expected_components(self):
        return [
            ComponentSpec(
                name="image_annotator",
                type_hint=AutoModelForCausalLM,
                repo="microsoft/Florence-2-large",
            ),
            ComponentSpec(
                name="image_annotator_processor",
                type_hint=AutoProcessor,
                repo="microsoft/Florence-2-large",
            ),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "image",
                Union[Image.Image, List[Image.Image]],
                required=True,
                description="Image(s) to annotate",
            ),
            InputParam(
                "annotation_task_prompt",
                Union[str, List[str]],
                required=True,
                description="Annotation task(s) to perform on the image(s), "
                "e.g. a Florence-2 task token such as "
                "'<REFERRING_EXPRESSION_SEGMENTATION>'",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "mask",
                type_hint=List[Image.Image],
                description="Segmentation mask(s) rasterized from the "
                "annotation polygons of the input image(s)",
            ),
        ]

    def annotate_image(self, image, prompt):
        # `prompt` is assumed to be a bare Florence-2 task token, e.g.
        # "<REFERRING_EXPRESSION_SEGMENTATION>", since the post-processed
        # result below is keyed by that token.
        inputs = self.image_annotator_processor(
            text=prompt, images=image, return_tensors="pt"
        ).to(self.image_annotator.device)
        generated_ids = self.image_annotator.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )
        generated_text = self.image_annotator_processor.batch_decode(
            generated_ids, skip_special_tokens=False
        )[0]
        # Florence-2 post-processing expects image_size as (width, height).
        annotations = self.image_annotator_processor.post_process_generation(
            generated_text, task=prompt, image_size=(image.width, image.height)
        )

        return annotations[prompt]

    def prepare_mask(self, images, annotations):
        masks = []
        for image, annotation in zip(images, annotations):
            mask_image = Image.new("L", image.size, 0)
            draw = ImageDraw.Draw(mask_image)
            # Florence-2 segmentation results carry one list of polygons per
            # detected instance; each polygon is a flat [x1, y1, x2, y2, ...]
            # coordinate list that PIL can rasterize directly.
            for instance_polygons in annotation["polygons"]:
                for polygon in instance_polygons:
                    draw.polygon(list(polygon), fill=255)
            masks.append(mask_image)

        return masks

    @torch.no_grad()
    def __call__(self, pipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        images = block_state.image
        annotation_task_prompt = block_state.annotation_task_prompt

        if not isinstance(images, list):
            images = [images]
        if not isinstance(annotation_task_prompt, list):
            annotation_task_prompt = [annotation_task_prompt]

        if len(images) != len(annotation_task_prompt):
            raise ValueError("Number of images and annotation prompts must match")

        # Annotate each image with its task prompt, then rasterize the
        # returned polygons into binary masks.
        annotations = [
            self.annotate_image(image, prompt)
            for image, prompt in zip(images, annotation_task_prompt)
        ]
        block_state.mask = self.prepare_mask(images, annotations)

        self.set_block_state(state, block_state)
        return pipeline, state
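

# Usage sketch (illustrative only): the Florence-2 round trip that
# `annotate_image` wraps, run standalone with plain transformers + PIL.
# The image path and text query below are placeholders, and Florence-2
# currently needs `trust_remote_code=True` when loaded directly like this.
if __name__ == "__main__":
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-large", trust_remote_code=True
    )
    processor = AutoProcessor.from_pretrained(
        "microsoft/Florence-2-large", trust_remote_code=True
    )

    task = "<REFERRING_EXPRESSION_SEGMENTATION>"
    image = Image.open("example.png").convert("RGB")  # placeholder path
    inputs = processor(text=task + "a red car", images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed = processor.post_process_generation(
        text, task=task, image_size=(image.width, image.height)
    )
    # parsed[task]["polygons"] is the structure prepare_mask() consumes.
    print(parsed[task]["polygons"])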