anmoldograpsl committed on
Commit 036580a · verified · 1 Parent(s): 3aec62f

Update app.py

Files changed (1)
  1. app.py +40 -48
app.py CHANGED
@@ -1,50 +1,42 @@
-
  import os
- import gradio as gr
  from PIL import Image
- from transformers import BlipProcessor, BlipForConditionalGeneration
- from peft import get_peft_model, LoraConfig, TaskType
- from huggingface_hub import login
-
- # Step 1: Log in to Hugging Face
- hf_token = os.getenv("HF_TOKEN")
- login(token=hf_token)
-
- # Step 2: Load the private model and processor
- model_name = "anushettypsl/paligemma_vqav2"  # Replace with the actual model link
- processor = BlipProcessor.from_pretrained(model_name)
- base_model = BlipForConditionalGeneration.from_pretrained(model_name)
-
- # Step 3: Set up PEFT configuration (if needed)
- lora_config = LoraConfig(
-     r=16,  # Rank
-     lora_alpha=32,  # Scaling factor
-     lora_dropout=0.1,  # Dropout
-     task_type=TaskType.VISUAL_QUESTION_ANSWERING,  # Adjust according to your model's task
- )
-
- # Step 4: Get the PEFT model
- peft_model = get_peft_model(base_model, lora_config)
-
- # Step 5: Define the prediction function
- def predict(image):
-     # Preprocess the image
-     image = processor(image, return_tensors="pt").pixel_values
-     # Generate output using the model
-     with torch.no_grad():
-         output = peft_model.generate(image)
-     # Decode the output to text
-     generated_text = processor.decode(output[0], skip_special_tokens=True)
-     return generated_text
-
- # Step 6: Create the Gradio interface
- interface = gr.Interface(
-     fn=predict,
-     inputs=gr.Image(type="pil"),  # Image input
-     outputs="text",  # Text output
-     title="Image-to-Text Model",
-     description="Upload an image to generate a descriptive text."
- )
-
- # Step 7: Launch the Gradio app
- interface.launch()
 
+ from huggingface_hub import login
  import os
+ from peft import PeftModel, PeftConfig
+ from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
  from PIL import Image
+ import requests
+ import torch
+ import io
+ import base64
+ import cv2
+
+ access_token = os.environ["HF_TOKEN"]
+ login(token=access_token)
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ dtype = torch.bfloat16
+
+ config = PeftConfig.from_pretrained("anushettypsl/paligemma_vqav2")
+ # base_model = AutoModelForCausalLM.from_pretrained("google/paligemma-3b-pt-448")
+ base_model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-pt-448")
+ model = PeftModel.from_pretrained(base_model, "anushettypsl/paligemma_vqav2", device_map=device)
+ processor = AutoProcessor.from_pretrained("google/paligemma-3b-pt-448", device_map=device)
+ model.to(device)
+
+ image = cv2.imread('/content/15_BC_G2_6358_40x_2_jpg.rf.97595fa4965f66ad45be8fd055331933.jpg')
+
+ # Convert the image to base64 encoding
+ image_bytes = cv2.imencode('.jpg', image)[1]
+ base64_string = base64.b64encode(image_bytes).decode('utf-8')
+
+ input_image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
+
+ model_inputs = processor(
+     text=input_text, images=input_image, return_tensors="pt").to(device)
+ input_len = model_inputs["input_ids"].shape[-1]
+ model.to(device)
+ with torch.inference_mode():
+     generation = model.generate(
+         **model_inputs, max_new_tokens=100, do_sample=False)
+     generation = generation[0][input_len:]
+     decoded = processor.decode(generation, skip_special_tokens=True)
+     print(decoded)
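
Note: the updated app.py passes text=input_text to the processor but never defines input_text, and it reads the image from a Colab-style /content/ path rather than from an uploaded file. A minimal sketch of the missing prompt definition, assuming a VQA-style question as the text input (the actual prompt string is not part of this commit):

# Hypothetical prompt; the real value used by the author is not shown in this diff.
# PaliGemma VQA checkpoints generally take a short question as the text input.
input_text = "answer en What does this image show?"

This assignment would need to appear before the processor(...) call for the script to run.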