import torch
from PIL import Image
from transformers import AutoProcessor, BitsAndBytesConfig, MllamaForConditionalGeneration
import gradio as gr
import json
import os
import re
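
# Assumes transformers >= 4.45 (which introduced the Mllama classes) and the
# bitsandbytes package for 4-bit loading; both are assumptions about this
# deployment, not pinned by the script itself.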

model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# The Llama 3.2 weights are gated, so a Hugging Face token is required.
token = os.getenv("HUGGINGFACE_TOKEN")
if token is None:
    raise RuntimeError("Set the HUGGINGFACE_TOKEN environment variable before starting the app.")
token = token.strip()

processor = AutoProcessor.from_pretrained(model_name, token=token)
model = MllamaForConditionalGeneration.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",  # 4-bit bitsandbytes models cannot be moved with .to('cuda')
    token=token,
)
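
# Optional sanity check: models dispatched by accelerate expose hf_device_map,
# which shows where each block of the quantized model was placed.
# print(getattr(model, "hf_device_map", model.device))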

def analyze_image(image, prompt):
    messages = [
        {"role": "user", "content": [
            {"type": "image"},  # placeholder; the actual image is passed to the processor below
            {"type": "text", "text": prompt}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=200)  # a detailed JSON answer can overrun 100 tokens

    # Decode only the newly generated tokens: the raw sequence echoes the prompt,
    # whose example JSON snippets would otherwise match the regex below.
    generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
    full_response = processor.decode(generated_tokens, skip_special_tokens=True)

    # Take the last JSON-like structure in the response; the non-greedy pattern
    # is enough for the flat {"present": ..., "reason": ...} object we ask for.
    json_matches = list(re.finditer(r'\{.*?\}', full_response, re.DOTALL))
    if json_matches:
        try:
            processed_json = json.loads(json_matches[-1].group(0))
        except json.JSONDecodeError as e:
            processed_json = {"error": f"Invalid JSON in model output: {e}", "full_response": full_response}
    else:
        processed_json = {"error": "No JSON found in model output", "full_response": full_response}

    return full_response, processed_json

default_prompt = """Analyze this image and determine if it contains a data logger. A data logger is typically a small, black electronic device used to monitor and record data over time, such as voltage, temperature, or current, via external sensors.

Carefully examine the image and provide a detailed response. If a data logger is present in the image, respond with:
{"present": true, "reason": "Detailed explanation of why you believe it's a data logger, including specific visual cues you've identified"}

If no data logger is visible, respond with:
{"present": false, "reason": "Detailed explanation of why you believe there's no data logger, describing what you see instead"}

Ensure your response is in valid JSON format."""
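
# Quick smoke test without the UI (hypothetical local image path; adjust as needed):
#     result_text, result_json = analyze_image(Image.open("bad.png"), default_prompt)
#     print(result_json)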

iface = gr.Interface(
    fn=analyze_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Prompt", value=default_prompt, lines=10)
    ],
    outputs=[
        gr.Textbox(label="Full Response", lines=10),
        gr.JSON(label="Processed JSON")
    ],
    title="Data Logger Detection using Llama 3.2 Vision",
    description="Upload an image and customize the prompt to check if it contains a data logger.",
    cache_examples=False,
    examples=[
        ["bad.png", default_prompt]
    ]
)

iface.launch()
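# launch() serves on localhost by default; pass share=True for a temporary
# public Gradio link, or server_name="0.0.0.0" to expose it on the LAN.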
