import numpy as np
import paddle


class VQATokenPad(object):
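    """Pad token-level VQA inputs (input_ids, token_type_ids, attention_mask,
    bbox and, outside infer mode, labels) to ``max_seq_len`` when
    ``pad_to_max_seq_len`` is set, and convert them to int64 numpy arrays;
    in infer mode, over-long fields are truncated to ``max_seq_len`` instead.
    """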
    def __init__(
        self,
        max_seq_len=512,
        pad_to_max_seq_len=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        truncation_strategy="longest_first",
        return_overflowing_tokens=False,
        return_special_tokens_mask=False,
        infer_mode=False,
        **kwargs
    ):
        self.max_seq_len = max_seq_len
        self.pad_to_max_seq_len = pad_to_max_seq_len
        self.return_attention_mask = return_attention_mask
        self.return_token_type_ids = return_token_type_ids
        self.truncation_strategy = truncation_strategy
        self.return_overflowing_tokens = return_overflowing_tokens
        self.return_special_tokens_mask = return_special_tokens_mask
        # Label id assigned to padded positions; CrossEntropyLoss ignores it.
        self.pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
        self.infer_mode = infer_mode

    def __call__(self, data):
        needs_to_be_padded = (
            self.pad_to_max_seq_len and len(data["input_ids"]) < self.max_seq_len
        )

        if needs_to_be_padded:
            # Use tokenizer-provided padding parameters when available,
            # otherwise fall back to these defaults.
            if "tokenizer_params" in data:
                tokenizer_params = data.pop("tokenizer_params")
            else:
                tokenizer_params = dict(
                    padding_side="right", pad_token_type_id=0, pad_token_id=1
                )

            difference = self.max_seq_len - len(data["input_ids"])
            if tokenizer_params["padding_side"] == "right":
                # Append padding after the real tokens.
                if self.return_attention_mask:
                    data["attention_mask"] = [1] * len(data["input_ids"]) + [
                        0
                    ] * difference
                if self.return_token_type_ids:
                    data["token_type_ids"] = (
                        data["token_type_ids"]
                        + [tokenizer_params["pad_token_type_id"]] * difference
                    )
                if self.return_special_tokens_mask:
                    data["special_tokens_mask"] = (
                        data["special_tokens_mask"] + [1] * difference
                    )
                data["input_ids"] = (
                    data["input_ids"] + [tokenizer_params["pad_token_id"]] * difference
                )
                if not self.infer_mode:
                    data["labels"] = (
                        data["labels"] + [self.pad_token_label_id] * difference
                    )
                data["bbox"] = data["bbox"] + [[0, 0, 0, 0]] * difference
            elif tokenizer_params["padding_side"] == "left":
                # Prepend padding before the real tokens.
                if self.return_attention_mask:
                    data["attention_mask"] = [0] * difference + [1] * len(
                        data["input_ids"]
                    )
                if self.return_token_type_ids:
                    data["token_type_ids"] = [
                        tokenizer_params["pad_token_type_id"]
                    ] * difference + data["token_type_ids"]
                if self.return_special_tokens_mask:
                    data["special_tokens_mask"] = [1] * difference + data[
                        "special_tokens_mask"
                    ]
                data["input_ids"] = [
                    tokenizer_params["pad_token_id"]
                ] * difference + data["input_ids"]
                if not self.infer_mode:
                    data["labels"] = [self.pad_token_label_id] * difference + data[
                        "labels"
                    ]
                data["bbox"] = [[0, 0, 0, 0]] * difference + data["bbox"]
        else:
            if self.return_attention_mask:
                data["attention_mask"] = [1] * len(data["input_ids"])

        for key in data:
            if key in [
                "input_ids",
                "labels",
                "token_type_ids",
                "bbox",
                "attention_mask",
            ]:
                if self.infer_mode:
                    # In infer mode, clip over-long fields to max_seq_len and
                    # leave the labels untouched.
                    if key != "labels":
                        length = min(len(data[key]), self.max_seq_len)
                        data[key] = data[key][:length]
                    else:
                        continue
                data[key] = np.array(data[key], dtype="int64")
        return data
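

# --- Minimal usage sketch ---------------------------------------------------
# Illustrative only: the token ids, boxes, and labels below are made-up
# placeholder values, not the output of a real tokenizer.
if __name__ == "__main__":
    pad_op = VQATokenPad(max_seq_len=8)
    sample = {
        "input_ids": [101, 2054, 2003, 102],  # hypothetical token ids
        "token_type_ids": [0, 0, 0, 0],
        "bbox": [[0, 0, 10, 10]] * 4,  # one bounding box per token
        "labels": [0, 1, 1, 0],  # token-level labels for training mode
    }
    padded = pad_op(sample)
    # All fields are now length-8 int64 numpy arrays; attention_mask marks
    # the four real tokens with 1 and the padded positions with 0.
    print(padded["input_ids"])  # e.g. [ 101 2054 2003  102    1    1    1    1]
    print(padded["attention_mask"])  # e.g. [1 1 1 1 0 0 0 0]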