from typing import Optional

from transformers import AutoTokenizer
from transformers.processing_utils import ProcessorMixin

from .image_processing_m2_encoder import M2EncoderImageProcessor


class M2EncoderProcessor(ProcessorMixin):
    """Joint processor bundling an ``M2EncoderImageProcessor`` and a
    GLM Chinese tokenizer behind the standard ``ProcessorMixin`` interface.

    Calling the processor tokenizes ``text`` and/or preprocesses ``images``
    and merges both results into a single encoding dict.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "M2EncoderImageProcessor"
    # Slow/fast tokenizer class names; no fast tokenizer is available.
    tokenizer_class = ("GLMChineseTokenizer", None)

    def __init__(self, image_processor, tokenizer):
        # NOTE(review): deliberately does NOT call ProcessorMixin.__init__,
        # presumably to skip its attribute-class validation for the custom
        # GLMChineseTokenizer — confirm before "fixing" this to super().
        self.image_processor = image_processor
        self.tokenizer = tokenizer

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load both sub-processors from ``pretrained_model_name_or_path``.

        ``trust_remote_code`` (default ``True``) is popped from ``kwargs`` and
        forwarded only to the tokenizer, since the tokenizer class ships as
        remote code; the remaining ``kwargs`` go to both loaders.
        """
        trust_remote_code = kwargs.pop("trust_remote_code", True)
        image_processor = M2EncoderImageProcessor.from_pretrained(
            pretrained_model_name_or_path, **kwargs
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
        return cls(image_processor=image_processor, tokenizer=tokenizer)

    def __call__(
        self,
        text=None,
        images=None,
        padding="max_length",
        truncation=True,
        max_length: Optional[int] = 52,
        return_tensors=None,
        **kwargs,
    ):
        """Encode ``text`` and/or ``images`` into one merged dict.

        Args:
            text: Optional text input forwarded to the tokenizer (padded to
                ``max_length`` by default, with special-tokens mask included).
            images: Optional image input forwarded to the image processor.
            padding / truncation / max_length: Tokenizer controls; the
                defaults pin sequences to exactly 52 tokens.
            return_tensors: Framework tensor format passed to both components.
            **kwargs: Extra arguments forwarded to BOTH the tokenizer and the
                image processor.

        Returns:
            A dict combining tokenizer and image-processor outputs; empty if
            both ``text`` and ``images`` are ``None``.
        """
        encoding = {}
        if text is not None:
            encoding.update(
                self.tokenizer(
                    text,
                    padding=padding,
                    truncation=truncation,
                    max_length=max_length,
                    return_special_tokens_mask=True,
                    return_tensors=return_tensors,
                    **kwargs,
                )
            )
        if images is not None:
            encoding.update(
                self.image_processor(images, return_tensors=return_tensors, **kwargs)
            )
        return encoding