import torch
from PIL import Image
from transformers import AutoProcessor, BitsAndBytesConfig, MllamaForConditionalGeneration
import gradio as gr
import json
import os
import re
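
# Assumes transformers >= 4.45 (which introduced the Mllama classes) and the
# bitsandbytes package for 4-bit loading; both are assumptions about this
# deployment, not pinned by the script itself.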

model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# The Llama 3.2 weights are gated, so a Hugging Face token is required.
token = os.getenv("HUGGINGFACE_TOKEN")
if token is None:
    raise RuntimeError("Set the HUGGINGFACE_TOKEN environment variable before starting the app.")
token = token.strip()

processor = AutoProcessor.from_pretrained(model_name, token=token)
model = MllamaForConditionalGeneration.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",  # 4-bit bitsandbytes models cannot be moved with .to('cuda')
    token=token,
)
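
# Optional sanity check: models dispatched by accelerate expose hf_device_map,
# which shows where each block of the quantized model was placed.
# print(getattr(model, "hf_device_map", model.device))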

def analyze_image(image, prompt):
    messages = [
        {"role": "user", "content": [
            {"type": "image"},  # placeholder; the actual image is passed to the processor below
            {"type": "text", "text": prompt}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=200)  # a detailed JSON answer can overrun 100 tokens

    # Decode only the newly generated tokens: the raw sequence echoes the prompt,
    # whose example JSON snippets would otherwise match the regex below.
    generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
    full_response = processor.decode(generated_tokens, skip_special_tokens=True)

    # Take the last JSON-like structure in the response; the non-greedy pattern
    # is enough for the flat {"present": ..., "reason": ...} object we ask for.
    json_matches = list(re.finditer(r'\{.*?\}', full_response, re.DOTALL))
    if json_matches:
        try:
            processed_json = json.loads(json_matches[-1].group(0))
        except json.JSONDecodeError as e:
            processed_json = {"error": f"Invalid JSON in model output: {e}", "full_response": full_response}
    else:
        processed_json = {"error": "No JSON found in model output", "full_response": full_response}

    return full_response, processed_json

default_prompt = """Analyze this image and determine if it contains a data logger. A data logger is typically a small, black electronic device used to monitor and record data over time, such as voltage, temperature, or current, via external sensors.

Carefully examine the image and provide a detailed response. If a data logger is present in the image, respond with:
{"present": true, "reason": "Detailed explanation of why you believe it's a data logger, including specific visual cues you've identified"}

If no data logger is visible, respond with:
{"present": false, "reason": "Detailed explanation of why you believe there's no data logger, describing what you see instead"}

Ensure your response is in valid JSON format."""
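
# Quick smoke test without the UI (hypothetical local image path; adjust as needed):
#     result_text, result_json = analyze_image(Image.open("bad.png"), default_prompt)
#     print(result_json)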

iface = gr.Interface(
    fn=analyze_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Prompt", value=default_prompt, lines=10)
    ],
    outputs=[
        gr.Textbox(label="Full Response", lines=10),
        gr.JSON(label="Processed JSON")
    ],
    title="Data Logger Detection using Llama 3.2 Vision",
    description="Upload an image and customize the prompt to check if it contains a data logger.",
    cache_examples=False,
    examples=[
        ["bad.png", default_prompt]
    ]
)

iface.launch()
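# launch() serves on localhost by default; pass share=True for a temporary
# public Gradio link, or server_name="0.0.0.0" to expose it on the LAN.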
