import gradio as gr
from autodistill_clip import CLIP
from autodistill_metaclip import MetaCLIP
from autodistill.detection import CaptionOntology
from PIL import Image
import tempfile

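# Load both zero-shot classifiers once at startup; the caption ontology
# (the candidate labels) is assigned per request in the functions below.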
clip_model = CLIP(None)
metaclip_model = MetaCLIP(None)

# Define one inference function per model, plus a combined function for the side-by-side interface.

def classify(model, image, text, **predict_kwargs):
    """Run an Autodistill zero-shot classifier on an image and return a
    {label: confidence} dict for a Gradio Label component."""
    # Add a catch-all class so the model can reject every user-provided label.
    classes = [t.strip() for t in text.split(",")] + ["something else"]

    # Gradio passes the image as a NumPy array, but the Autodistill models
    # predict from a file path, so write the image to a temporary file.
    with tempfile.NamedTemporaryFile(suffix=".jpg") as temp:
        Image.fromarray(image.astype("uint8"), "RGB").save(temp.name)

        # Map each prompt to itself so predicted class names match the prompts.
        model.ontology = CaptionOntology({c: c for c in classes})

        predictions = model.predict(temp.name, **predict_kwargs)

        labels = [classes[i] for i in predictions.class_id.tolist()]
        confidences = predictions.confidence.tolist()

        return dict(zip(labels, confidences))

def clip_model_interface(image, text):
    # CLIP currently returns only the top class (see the note in the description).
    return classify(clip_model, image, text)

def metaclip_model_interface(image, text):
    # confidence=0 keeps every class so all scores are shown.
    return classify(metaclip_model, image, text, confidence=0)

def combined_model_interface(input_image, input_text):
    # Run both models on the same inputs and return their results side by side.
    clip_output = clip_model_interface(input_image, input_text)
    metaclip_output = metaclip_model_interface(input_image, input_text)

    return clip_output, metaclip_output

inputs = [
    "image",
    "text"
]

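# gr.Label renders a {label: confidence} dict as ranked confidence bars,
# giving each model its own labelled result panel.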
outputs = [
    gr.Label(label="CLIP"),
    gr.Label(label="MetaCLIP")
]

title = "CLIP vs MetaCLIP"

description = """
CLIP is a zero-shot classification and embedding model developed by OpenAI.

MetaCLIP is a model developed by Meta AI that uses the CLIP architecture and is trained on an openly curated dataset.

Use this space to try out the models and see how they perform on your own images and text.

Note: Due to the way this space was implemented, CLIP will only return the top class. A fix is coming soon.

This project uses the following dependencies:

- [autodistill-clip](https://github.com/autodistill/autodistill-clip)
- [autodistill-metaclip](https://github.com/autodistill/autodistill-metaclip)
"""

gr.Interface(
    fn=combined_model_interface,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    allow_flagging="never"
).launch()