"""Smoke-test a VitisAI-quantized GPT-2 ONNX model.

Loads the quantized model with the VitisAI execution provider, then runs two
fixed prompts through it and prints the greedy (argmax-per-position) decode of
the logits. Intended as a quick end-to-end sanity check, not a generation loop.
"""
import os

import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# Hard-coded artifacts from the Olive quantization run and the VOE package.
ONNX_PATH = r"C:\Users\Felix\Olive\examples\gpt2\cache\models\1_VitisAIQuantization-1193226590a636c107851db60c66899c-ebec96f9d75c46bed8dc01c8240c6bad-cpu-cpu\output_model\model.onnx"
CONFIG_PATH = r"C:\Users\Felix\Downloads\voe-3.5-win_amd64\voe-3.5-win_amd64\vaip_config.json"


def _run_prompt(session, tokenizer, prompt, max_length=128):
    """Tokenize *prompt*, run one forward pass, and print the greedy decode.

    The model's feeds are int32, so the tokenizer's int64 outputs are cast.
    position_ids is a plain 0..L-1 ramp regardless of left-padding — this
    mirrors the original script; NOTE(review): with padding_side="left" the
    real token positions are offset, so confirm the model was exported to
    expect a plain ramp.
    """
    inps = tokenizer(
        prompt, return_tensors="np", padding="max_length", max_length=max_length
    )
    seq_len = inps["attention_mask"].shape[1]
    feeds = {
        "input_ids": inps["input_ids"].astype(np.int32),
        "attention_mask": inps["attention_mask"].astype(np.int32),
        "position_ids": np.arange(seq_len, dtype=np.int32)[None, :],
    }
    logits = session.run(None, feeds)[0]
    # Greedy per-position token ids; decoding them shows what the model
    # predicts at every position, not a sampled continuation.
    token_ids = np.argmax(logits, axis=-1)
    print(tokenizer.batch_decode(token_ids))


def main():
    """Load model + tokenizer and run the two demo prompts."""
    # .get avoids a KeyError crash when the firmware variable is unset;
    # the sentinel makes the misconfiguration visible in the output.
    print("my env", os.environ.get("XLNX_VART_FIRMWARE", "<XLNX_VART_FIRMWARE not set>"))

    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
    # tiny-random-gpt2 has no pad token; reuse EOS and pad on the left so the
    # prompt's last real token sits at the end of the window.
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "left"

    session = ort.InferenceSession(
        ONNX_PATH,
        providers=["VitisAIExecutionProvider"],
        provider_options=[{"config_file": CONFIG_PATH}],
    )
    print("after load")

    _run_prompt(session, tokenizer, "Hey hey hey! This is me and")
    _run_prompt(session, tokenizer, "Hey hey hey! This is me and I love to")


if __name__ == "__main__":
    main()