cantremember's picture
more logging
7a6a83f
raw
history blame
1.41 kB
#!/usr/bin/env python3
from transformers.utils import logging
logging.set_verbosity_error()
from transformers import BlipForImageTextRetrieval
from transformers import AutoProcessor
from PIL import Image
import math, random, time
# import random
# import time
import torch
# Multi-modal model: accepts both text and image content (or audio, etc.).
# This script loads a BLIP image-text-matching (ITM) checkpoint, loads one
# image from disk, then repeatedly picks a random caption and prints the
# softmax-normalised score the model assigns to that (image, caption) pair.
print("loading model ...")
model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")

print("loading image ...")
raw_image = Image.open('./assets/pot-o-gold-my-little-pony-Derpy.jpeg').convert('RGB')

print("processing ...")
# Candidate captions to score against the image.
statements = [
    "an image of a horse",
    "a horse and a rainbow",
    "a pony and a rainbow",
    "a unicorn and a rainbow",
    "a pony in a forest",
    # NOTE(review): "rainbox" looks like a typo for "rainbow"; left unchanged
    # because this text is fed to the model and altering it alters the score.
    "a rainbox over a lake",
    "a horse running through the forest",
    "two eyes that do not match",
    "equine joy",
    "a stallion and gold coins",
    "a mare and gold coins",
]

# Intentionally endless: keeps sampling captions until the process is
# interrupted (e.g. Ctrl-C).
while True:
    # Uniformly pick one caption per iteration.
    text = random.choice(statements)
    inputs = processor(
        images=raw_image,
        text=text,
        return_tensors="pt",  # PyTorch tensors
    )
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        # First element of the model output holds the raw ITM scores.
        itm_scores = model(**inputs)[0]
    # Normalise the scores across dim 1 so they sum to 1 per sample.
    itm_score = torch.nn.functional.softmax(itm_scores, dim=1)
    # Index [0][1]: first (only) sample, second class — presumably the
    # "match" probability per the BLIP ITM head; confirm against model docs.
    print(f"""'{text}' => {itm_score[0][1]:.2f}""")