import base64
import io
import subprocess

import gradio as gr
import torch
from huggingface_hub import InferenceApi
from peft import LoraConfig
from PIL import Image
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration

# Install flash-attn at startup, skipping the CUDA build step (standard Spaces workaround).
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

DEVICE = "cuda:0"
USE_LORA = False
USE_QLORA = True

# do_image_splitting=False feeds each image as a single tile instead of sub-crops.
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b", do_image_splitting=False)

if USE_QLORA or USE_LORA:
    # LoRA adapters over the attention/MLP projections of the text model,
    # the modality projection, and the perceiver resampler.
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules='.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*',
        use_dora=not USE_QLORA,  # DoRA only for plain LoRA, not with 4-bit quantization
        init_lora_weights="gaussian"
    )
    # 4-bit NF4 quantization for the QLoRA path.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    ) if USE_QLORA else None
    model = Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b",
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
    )
    model.add_adapter(lora_config)
    model.enable_adapters()
else:
    model = Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b",
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2"
    ).to(DEVICE)
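
# Optional smoke test of the locally loaded model (not used by the Gradio demo
# below, which calls the hosted Inference API instead). This is a minimal sketch
# of the standard IDEFICS2 chat-template flow; "bike.png" is one of the example
# images the demo already assumes to be present.
RUN_LOCAL_SMOKE_TEST = False  # flip to True to try local generation

if RUN_LOCAL_SMOKE_TEST:
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    test_inputs = processor(text=prompt, images=[Image.open("bike.png")], return_tensors="pt").to(DEVICE)
    generated_ids = model.generate(**test_inputs, max_new_tokens=256)
    print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])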

# The demo below runs inference through the hosted Inference API rather than the
# locally loaded model. Note that InferenceApi is deprecated in recent versions
# of huggingface_hub in favor of InferenceClient.
client = InferenceApi("HuggingFaceM4/idefics2-8b")

def image_to_base64(image):
    # JPEG has no alpha channel, so convert RGBA/paletted uploads (e.g. PNGs) to RGB first.
    if image.mode != "RGB":
        image = image.convert("RGB")
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
    return img_str

def model_inference(image, text):
    image_base64 = image_to_base64(image)
    # InferenceApi.__call__ wraps its argument under "inputs" itself, so pass
    # the payload fields directly rather than nesting them under "inputs" again.
    payload = {
        "text": text,
        "image": image_base64
    }

    result = client(payload)
    print(result)
    # The API may return a dict or a list of dicts; either way, pull out "generated_text".
    if isinstance(result, list):
        result = result[0]
    return result['generated_text']

with gr.Blocks(css="""
.input_image, .prompt_input {
    background-color: lightgrey;
}
""") as demo:
    gr.Markdown("## IDEFICS2 Demo")
    
    # Create a row with two columns of equal size
    with gr.Row():
        with gr.Column():
            # Labelled input fields
            image_input = gr.Image(label="Upload Image", type="pil", elem_classes=["input_image"])
            query_input = gr.Textbox(label="Enter Prompt", placeholder="Type your prompt here...", elem_classes=["prompt_input"])
        with gr.Column():
            # Output textbox
            output = gr.Textbox(label="Model Output", interactive=True, placeholder="Output will be displayed here...", elem_classes=["prompt_input"])
    
    # Button to submit the inputs for model inference
    submit_btn = gr.Button("Generate")
    submit_btn.click(model_inference, inputs=[image_input, query_input], outputs=output)

    # Example inputs for quick testing
    examples = [
        ["american_football.png", "Explain in detail what is depicted in the picture"],
        ["bike.png", "Describe in detail what you see in this image."],
        ["finance.png", "Describe in detail everything you see in the image."],
        ["science.png", "Extract all visible text in the image, keeping the format."],
        ["spirituality.png", "Extract all text from the image, preserving its format."]
    ]
    gr.Examples(examples=examples, inputs=[image_input, query_input], outputs=output)
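    # Note: the example image files referenced above are assumed to ship alongside this app.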

# Launch the Gradio app with debugging enabled
demo.launch(debug=True)

# Alternative layout kept for reference (commented out): styled, fixed-size image
# input and longer OCR-focused example prompts.
# with gr.Blocks(css=".input_image {max-width: 100%; border: 1px solid #ccc; box-shadow: 0 0 10px #ccc; margin-bottom: 10px;} .output_textbox {min-height: 100px;}") as demo:
#     gr.Markdown("## Enhanced IDEFICS2 Demo")
#     with gr.Row():
#         with gr.Column(scale=1):
#             image_input = gr.Image(label="Upload Image", type="pil", height=240, width=320)
#             query_input = gr.Textbox(label="Enter Prompt", placeholder="Type your prompt here...")
#         with gr.Column(scale=1):
#             output = gr.Textbox(label="Model Output", interactive=True, placeholder="Output will be displayed here...")

#     submit_btn = gr.Button("Generate")
#     submit_btn.click(model_inference, inputs=[image_input, query_input], outputs=output)

#     examples = [
#         ["american_football.png", "Explain in detail what is depicted in the picture"],
#         ["bike.png", "Explore the image closely and describe in detail what you discover."],
#         ["finance.png", "Provide a detailed description of everything you see in the image."],
#         ["science.png", "Please perform optical character recognition (OCR) on the uploaded image. Extract all text visible in the image accurately. Ensure to capture the text in its entirety and maintain the formatting as closely as possible to how it appears in the image. After extracting the text, display it in a clear and readable format, making sure that any special characters or symbols are also accurately represented. Provide the extracted text as output."],
#         ["spirituality.png", "Please perform optical character recognition (OCR) on the uploaded image. Extract all text visible in the image accurately. Ensure to capture the text in its entirety and maintain the formatting as closely as possible to how it appears in the image. After extracting the text, display it in a clear and readable format, making sure that any special characters or symbols are also accurately represented. Provide the extracted text as output."]
#     ]
#     gr.Examples(examples=examples, inputs=[image_input, query_input], outputs=output)

# demo.launch(debug=True)