---
license: apache-2.0
---
|
ONNX export of the voxerality/rgb_language_cap image-captioning model.

Model inference example:
|
```python
import numpy as np
import onnxruntime as ort
from PIL import Image
from transformers import AutoImageProcessor, AutoTokenizer

# load the ONNX models (encoder and decoder) from local paths
encoder_onnx_path = 'models/rgb_language_cap_onnx/encoder_model.onnx'
decoder_onnx_path = 'models/rgb_language_cap_onnx/decoder_model.onnx'
encoder_session = ort.InferenceSession(encoder_onnx_path, providers=["CPUExecutionProvider"])
decoder_session = ort.InferenceSession(decoder_onnx_path, providers=["CPUExecutionProvider"])

# load the tokenizer and image processor
model_id = "models/rgb_language_cap_onnx"
processor = AutoImageProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# load and preprocess the image
image_path = "img2.jpg"
image = Image.open(image_path).convert("RGB")
inputs = processor(images=image, return_tensors="np").pixel_values

# run the encoder model
encoder_outputs = encoder_session.run(
    None,
    {"pixel_values": inputs},
)

# extract the encoder hidden states (first encoder output)
encoder_hidden_states = encoder_outputs[0]

# seed the decoder with the beginning-of-sequence token
decoder_input_ids = np.array([[tokenizer.bos_token_id]], dtype=np.int64)

# greedy decoding loop
max_length = 200  # maximum length of the generated sequence

for _ in range(max_length):
    decoder_outputs = decoder_session.run(
        None,
        {
            "input_ids": decoder_input_ids,                   # tokens generated so far
            "encoder_hidden_states": encoder_hidden_states,   # outputs from the encoder
        },
    )

    # extract the logits and pick the most likely next token
    logits = decoder_outputs[0]
    predicted_token_id = np.argmax(logits[0, -1, :])

    # stop when the model emits the end-of-sequence token
    if predicted_token_id == tokenizer.eos_token_id:
        break

    # append the predicted token ID to the decoder inputs for the next step
    decoder_input_ids = np.concatenate(
        [decoder_input_ids, np.array([[predicted_token_id]], dtype=np.int64)], axis=-1
    )

# decode the predicted token IDs into text and print the generated caption
predicted_text = tokenizer.decode(decoder_input_ids[0], skip_special_tokens=True)
print(predicted_text)
```
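
The loop above re-runs the decoder over the full token sequence at every step (greedy decoding without a key/value cache), which is simple but slows down as the caption grows. If you have 🤗 Optimum installed, an encoder/decoder ONNX export like this one can usually be loaded through `ORTModelForVision2Seq`, which wraps the ONNX Runtime sessions and lets `generate()` handle the decoding loop. This is a minimal sketch, assuming the exported directory layout in this repo is compatible with Optimum's loader:

```python
from optimum.onnxruntime import ORTModelForVision2Seq
from transformers import AutoImageProcessor, AutoTokenizer
from PIL import Image

model_id = "models/rgb_language_cap_onnx"  # same local export as above

# use_cache=False since this export has no decoder-with-past model (assumption)
model = ORTModelForVision2Seq.from_pretrained(model_id, use_cache=False)
processor = AutoImageProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

pixel_values = processor(
    images=Image.open("img2.jpg").convert("RGB"), return_tensors="pt"
).pixel_values

# generate() runs the encoder once and decodes greedily by default
generated_ids = model.generate(pixel_values, max_length=200)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
```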