aamirshakir committed on
Commit 155d104 · verified · 1 Parent(s): 5951289

Make infer general so it runs on non-CUDA devices


infer hard-codes .cuda(), which prevents it from running on CPU or other devices. Instead, use self.device so it can run on CPU, MPS, etc.
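
A minimal sketch of the pattern this change applies, using a toy module. TinyModel, its device property, and infer_step are illustrative names only, not part of this repository; in the real model, transformers' PreTrainedModel already exposes the device property that self.device resolves to.

```python
import torch
import torch.nn as nn


class TinyModel(nn.Module):
    """Toy stand-in used only to illustrate the device-agnostic pattern."""

    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)

    @property
    def device(self) -> torch.device:
        # Resolve the device from the module's own parameters, mirroring
        # the self.device property that PreTrainedModel provides.
        return next(self.parameters()).device

    def infer_step(self, x: torch.Tensor) -> torch.Tensor:
        # Before: x = x.cuda()   -> fails on machines without CUDA.
        # After:  move inputs to wherever the model's weights live.
        x = x.to(self.device)
        # torch.autocast takes a device-type string ("cuda", "cpu", "mps"),
        # so self.device.type generalizes the hard-coded "cuda".
        with torch.autocast(self.device.type, dtype=torch.bfloat16):
            with torch.no_grad():
                return self.proj(x)


model = TinyModel()                        # stays on CPU in this sketch
out = model.infer_step(torch.randn(2, 8))  # no .cuda() anywhere
```

Note that autocast coverage for the "mps" backend varies by PyTorch version, so bfloat16 behaviour off-CUDA may differ from a CUDA run.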

Files changed (1)
  1. modeling_deepseekocr.py +365 -257
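
A hedged usage sketch of the patched infer on a machine without CUDA. The repo id, image path, and output directory below are placeholders, and your checkout's auto_map or dtype/attention settings may require additional from_pretrained kwargs; the infer signature itself matches the one defined in this file.

```python
import torch
from transformers import AutoModel, AutoTokenizer

MODEL_ID = "deepseek-ai/DeepSeek-OCR"  # placeholder repo id

# Pick the best available backend, falling back to CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.eval().to(device)

# eval_mode=True makes infer return the decoded text instead of streaming it.
text = model.infer(
    tokenizer,
    prompt="<image>\nFree OCR. ",
    image_file="page.png",   # placeholder input image
    output_path="out",       # placeholder output directory
    eval_mode=True,
)
print(text)
```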
modeling_deepseekocr.py CHANGED
@@ -1,6 +1,9 @@
1
  from .modeling_deepseekv2 import DeepseekV2Model, DeepseekV2ForCausalLM
2
  from .configuration_deepseek_v2 import DeepseekV2Config
3
- from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 
 
 
4
  from typing import List, Optional, Tuple, Union
5
  from transformers.cache_utils import Cache
6
  import requests
@@ -25,14 +28,13 @@ import time
25
 
26
 
27
  def load_image(image_path):
28
-
29
  try:
30
  image = Image.open(image_path)
31
-
32
  corrected_image = ImageOps.exif_transpose(image)
33
-
34
  return corrected_image
35
-
36
  except Exception as e:
37
  print(f"error: {e}")
38
  try:
@@ -42,7 +44,7 @@ def load_image(image_path):
42
 
43
 
44
  def re_match(text):
45
- pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
46
  matches = re.findall(pattern, text, re.DOTALL)
47
 
48
  # pattern1 = r'<\|ref\|>.*?<\|/ref\|>\n'
@@ -51,7 +53,7 @@ def re_match(text):
51
  mathes_image = []
52
  mathes_other = []
53
  for a_match in matches:
54
- if '<|ref|>image<|/ref|>' in a_match[0]:
55
  mathes_image.append(a_match[0])
56
  else:
57
  mathes_other.append(a_match[0])
@@ -59,7 +61,6 @@ def re_match(text):
59
 
60
 
61
  def extract_coordinates_and_label(ref_text, image_width, image_height):
62
-
63
  try:
64
  label_type = ref_text[1]
65
  cor_list = eval(ref_text[2])
@@ -71,33 +72,36 @@ def extract_coordinates_and_label(ref_text, image_width, image_height):
71
 
72
 
73
  def draw_bounding_boxes(image, refs, ouput_path):
74
-
75
  image_width, image_height = image.size
76
-
77
  img_draw = image.copy()
78
  draw = ImageDraw.Draw(img_draw)
79
 
80
- overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
81
  draw2 = ImageDraw.Draw(overlay)
82
-
83
  # try:
84
  # except IOError:
85
  # try:
86
- # font = ImageFont.truetype("DejaVuSans.ttf", 20)
87
  # except IOError:
88
  font = ImageFont.load_default()
89
 
90
  img_idx = 0
91
-
92
  for i, ref in enumerate(refs):
93
  try:
94
  result = extract_coordinates_and_label(ref, image_width, image_height)
95
  if result:
96
  label_type, points_list = result
97
-
98
- color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))
99
 
100
- color_a = color + (20, )
 
 
 
 
 
 
101
  for points in points_list:
102
  x1, y1, x2, y2 = points
103
 
@@ -107,7 +111,7 @@ def draw_bounding_boxes(image, refs, ouput_path):
107
  x2 = int(x2 / 999 * image_width)
108
  y2 = int(y2 / 999 * image_height)
109
 
110
- if label_type == 'image':
111
  try:
112
  cropped = image.crop((x1, y1, x2, y2))
113
  cropped.save(f"{ouput_path}/images/{img_idx}.jpg")
@@ -115,24 +119,35 @@ def draw_bounding_boxes(image, refs, ouput_path):
115
  print(e)
116
  pass
117
  img_idx += 1
118
-
119
  try:
120
- if label_type == 'title':
121
  draw.rectangle([x1, y1, x2, y2], outline=color, width=4)
122
- draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)
 
 
 
 
 
123
  else:
124
  draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
125
- draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)
 
 
 
 
 
126
  text_x = x1
127
  text_y = max(0, y1 - 15)
128
-
129
-
130
  text_bbox = draw.textbbox((0, 0), label_type, font=font)
131
  text_width = text_bbox[2] - text_bbox[0]
132
  text_height = text_bbox[3] - text_bbox[1]
133
- draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height],
134
- fill=(255, 255, 255, 30))
135
-
 
 
136
  draw.text((text_x, text_y), label_type, font=font, fill=color)
137
  except:
138
  pass
@@ -143,17 +158,13 @@ def draw_bounding_boxes(image, refs, ouput_path):
143
 
144
 
145
  def process_image_with_refs(image, ref_texts, output_path):
146
-
147
  result_image = draw_bounding_boxes(image, ref_texts, output_path)
148
-
149
- return result_image
150
-
151
-
152
 
 
153
 
154
 
155
  def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
156
- best_ratio_diff = float('inf')
157
  best_ratio = (1, 1)
158
  area = width * height
159
  for ratio in target_ratios:
@@ -169,20 +180,27 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_
169
  return best_ratio
170
 
171
 
172
- def dynamic_preprocess(image, min_num=2, max_num=9, image_size=640, use_thumbnail=False):
 
 
173
  orig_width, orig_height = image.size
174
  aspect_ratio = orig_width / orig_height
175
 
176
  # calculate the existing image aspect ratio
177
  target_ratios = set(
178
- (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
179
- i * j <= max_num and i * j >= min_num)
 
 
 
 
180
  # print(target_ratios)
181
  target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
182
 
183
  # find the closest aspect ratio to the target
184
  target_aspect_ratio = find_closest_aspect_ratio(
185
- aspect_ratio, target_ratios, orig_width, orig_height, image_size)
 
186
 
187
  # print(target_aspect_ratio)
188
  # calculate the target width and height
@@ -198,7 +216,7 @@ def dynamic_preprocess(image, min_num=2, max_num=9, image_size=640, use_thumbnai
198
  (i % (target_width // image_size)) * image_size,
199
  (i // (target_width // image_size)) * image_size,
200
  ((i % (target_width // image_size)) + 1) * image_size,
201
- ((i // (target_width // image_size)) + 1) * image_size
202
  )
203
  # split the image
204
  split_img = resized_img.crop(box)
@@ -210,15 +228,14 @@ def dynamic_preprocess(image, min_num=2, max_num=9, image_size=640, use_thumbnai
210
  return processed_images, target_aspect_ratio
211
 
212
 
213
-
214
  def normalize_transform(mean, std):
215
  if mean is None and std is None:
216
  transform = None
217
  elif mean is None and std is not None:
218
- mean = [0.] * len(std)
219
  transform = transforms.Normalize(mean=mean, std=std)
220
  elif mean is not None and std is None:
221
- std = [1.] * len(mean)
222
  transform = transforms.Normalize(mean=mean, std=std)
223
  else:
224
  transform = transforms.Normalize(mean=mean, std=std)
@@ -226,11 +243,10 @@ def normalize_transform(mean, std):
226
  return transform
227
 
228
 
229
-
230
  def format_messages(
231
- conversations: List[Dict[str, str]],
232
- sft_format: str = "deepseek",
233
- system_prompt: str = "",
234
  ):
235
  """
236
  Applies the SFT template to conversation.
@@ -264,6 +280,7 @@ def text_encode(tokenizer, text: str, bos: bool = True, eos: bool = False):
264
 
265
  return t
266
 
 
267
  def load_pil_images(conversations: List[Dict[str, str]]) -> List[Image.Image]:
268
  """
269
 
@@ -294,7 +311,7 @@ def load_pil_images(conversations: List[Dict[str, str]]) -> List[Image.Image]:
294
  # print(image_path)
295
  # print('----------------')
296
  # exit()
297
-
298
  # pil_img = Image.open(image_path)
299
  pil_img = load_image(image_path)
300
  pil_img = pil_img.convert("RGB")
@@ -304,7 +321,6 @@ def load_pil_images(conversations: List[Dict[str, str]]) -> List[Image.Image]:
304
 
305
 
306
  class BaseTransform(ABC):
307
-
308
  def set_rng(self, *args, **kwargs):
309
  pass
310
 
@@ -318,32 +334,32 @@ class BaseTransform(ABC):
318
 
319
  class BasicImageTransform(BaseTransform):
320
  def __init__(
321
- self,
322
  mean: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
323
  std: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
324
- normalize: bool = True
325
  ):
326
  self.mean = mean
327
  self.std = std
328
-
329
- transform_pipelines = [
330
- transforms.ToTensor()
331
- ]
332
 
333
  normalize = normalize_transform(mean, std) if normalize else nn.Identity()
334
  if normalize is not None:
335
  transform_pipelines.append(normalize)
336
 
337
  self.transform = transforms.Compose(transform_pipelines)
338
-
339
  def __call__(self, x):
340
  x = self.transform(x)
341
  return x
342
 
 
343
  class NoEOSTextStreamer(TextStreamer):
344
  def on_finalized_text(self, text: str, stream_end: bool = False):
345
-
346
- eos_text = self.tokenizer.decode([self.tokenizer.eos_token_id], skip_special_tokens=False)
 
347
  text = text.replace(eos_text, "\n")
348
  print(text, flush=True, end="")
349
 
@@ -351,6 +367,7 @@ class NoEOSTextStreamer(TextStreamer):
351
  class DeepseekOCRConfig(DeepseekV2Config):
352
  model_type = "DeepseekOCR"
353
 
 
354
  class DeepseekOCRModel(DeepseekV2Model):
355
  config_class = DeepseekOCRConfig
356
 
@@ -361,14 +378,13 @@ class DeepseekOCRModel(DeepseekV2Model):
361
  self.vision_model = build_clip_l()
362
  # self.conv_2 = nn.Conv2d(in_channels=1024, out_channels=2048, kernel_size=2, stride=2)
363
  n_embed = 1280
364
- self.projector = MlpProjector(Dict(projector_type="linear", input_dim=2048, n_embed=n_embed))
 
 
365
  embed_std = 1 / torch.sqrt(torch.tensor(n_embed, dtype=torch.float32))
366
  self.image_newline = nn.Parameter(torch.randn(n_embed) * embed_std)
367
  self.view_seperator = nn.Parameter(torch.randn(n_embed) * embed_std)
368
 
369
-
370
-
371
-
372
  def forward(
373
  self,
374
  input_ids: torch.LongTensor = None,
@@ -384,28 +400,23 @@ class DeepseekOCRModel(DeepseekV2Model):
384
  images_spatial_crop: Optional[torch.FloatTensor] = None,
385
  return_dict: Optional[bool] = None,
386
  ) -> Union[Tuple, BaseModelOutputWithPast]:
387
-
388
-
389
-
390
-
391
  if inputs_embeds is None:
392
  # inputs_embeds = self.embed_tokens(input_ids)
393
  inputs_embeds = self.get_input_embeddings()(input_ids)
394
 
395
-
396
-
397
- sam_model = getattr(self, 'sam_model', None)
398
  # sam_model = self.sam_model
399
- vision_model = getattr(self, 'vision_model', None)
400
-
401
-
402
-
403
- if sam_model is not None and (input_ids.shape[1] != 1 or self.training) and torch.sum(images[0][1]).item() != 0:
404
 
 
 
 
 
 
405
  idx = 0
406
-
407
  # sam_model = torch.jit.script(sam_model)
408
-
409
  # start_time = time.time()
410
  for image, crop_shape in zip(images, images_spatial_crop):
411
  images_in_this_batch = []
@@ -414,53 +425,86 @@ class DeepseekOCRModel(DeepseekV2Model):
414
  image_ori = image[1]
415
 
416
  with torch.no_grad():
417
- # with torch.inference_mode():
418
-
419
  if torch.sum(patches).item() != 0:
420
  # P, C, H, W = patches.shape
421
  crop_flag = 1
422
  local_features_1 = sam_model(patches)
423
 
424
- local_features_2 = vision_model(patches, local_features_1)
425
  # vit_time = time.time()
426
- local_features = torch.cat((local_features_2[:, 1:], local_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
 
 
 
 
 
 
427
  local_features = self.projector(local_features)
428
 
429
-
430
  global_features_1 = sam_model(image_ori)
431
- global_features_2 = vision_model(image_ori, global_features_1)
432
- global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
 
 
 
 
 
 
433
  global_features = self.projector(global_features)
434
 
435
- print('=====================')
436
- print('BASE: ', global_features.shape)
437
- print('PATCHES: ', local_features.shape)
438
- print('=====================')
439
 
440
  _, hw, n_dim = global_features.shape
441
- h = w = int(hw ** 0.5)
442
 
443
  _2, hw2, n_dim2 = local_features.shape
444
- h2 = w2 = int(hw2 ** 0.5)
445
 
446
  width_crop_num, height_crop_num = crop_shape[0], crop_shape[1]
447
 
448
  global_features = global_features.view(h, w, n_dim)
449
 
450
  global_features = torch.cat(
451
- [global_features, self.image_newline[None, None, :].expand(h, 1, n_dim)], dim=1
 
 
 
 
452
  )
453
 
454
  global_features = global_features.view(-1, n_dim)
455
 
456
-
457
- local_features = local_features.view(height_crop_num, width_crop_num, h2, w2, n_dim2).permute(0, 2, 1, 3, 4).reshape(height_crop_num*h2, width_crop_num*w2, n_dim2)
 
 
 
 
 
458
  local_features = torch.cat(
459
- [local_features, self.image_newline[None, None, :].expand(height_crop_num * h2, 1, n_dim2)], dim=1
 
 
 
 
 
 
460
  )
461
  local_features = local_features.view(-1, n_dim2)
462
 
463
- global_local_features = torch.cat([local_features, global_features, self.view_seperator[None, :]], dim=0)
 
 
 
 
 
 
 
464
 
465
  # end_time = time.time()
466
 
@@ -469,32 +513,42 @@ class DeepseekOCRModel(DeepseekV2Model):
469
  # print('all: ', end_time - start_time)
470
 
471
  # exit()
472
-
473
  else:
474
  global_features_1 = sam_model(image_ori)
475
- global_features_2 = vision_model(image_ori, global_features_1)
476
- global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
 
 
 
 
 
 
477
  global_features = self.projector(global_features)
478
- print('=====================')
479
- print('BASE: ', global_features.shape)
480
- print('NO PATCHES')
481
- print('=====================')
482
  _, hw, n_dim = global_features.shape
483
- h = w = int(hw ** 0.5)
484
-
485
 
486
  global_features = global_features.view(h, w, n_dim)
487
 
488
  global_features = torch.cat(
489
- [global_features, self.image_newline[None, None, :].expand(h, 1, n_dim)], dim=1
 
 
 
 
490
  )
491
 
492
  global_features = global_features.view(-1, n_dim)
493
 
494
- global_local_features = torch.cat([global_features, self.view_seperator[None, :]], dim=0)
 
 
495
 
496
  images_in_this_batch.append(global_local_features)
497
-
498
 
499
  # print(inputs_embeds.shape)
500
 
@@ -502,21 +556,27 @@ class DeepseekOCRModel(DeepseekV2Model):
502
  images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
503
  # exit()
504
 
505
- inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
 
 
 
506
 
507
  idx += 1
508
-
509
 
510
  return super(DeepseekOCRModel, self).forward(
511
- input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
512
- inputs_embeds=inputs_embeds, use_cache=use_cache, position_ids = position_ids,
513
- output_attentions=output_attentions, output_hidden_states=output_hidden_states,
514
- return_dict=return_dict
 
 
 
 
 
515
  )
516
-
517
 
518
- class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
519
 
 
520
  config_class = DeepseekOCRConfig
521
  # supports_gradient_checkpointing = True
522
 
@@ -536,7 +596,6 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
536
  def get_model(self):
537
  return self.model
538
 
539
-
540
  def forward(
541
  self,
542
  input_ids: torch.LongTensor = None,
@@ -552,17 +611,22 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
552
  images_seq_mask: Optional[torch.FloatTensor] = None,
553
  images_spatial_crop: Optional[torch.FloatTensor] = None,
554
  return_dict: Optional[bool] = None,
555
-
556
  ) -> Union[Tuple, CausalLMOutputWithPast]:
557
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 
 
 
 
558
  output_hidden_states = (
559
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
 
 
 
 
560
  )
561
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
562
-
563
-
564
 
565
- outputs = self.model(
566
  input_ids=input_ids,
567
  past_key_values=past_key_values,
568
  attention_mask=attention_mask,
@@ -572,14 +636,11 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
572
  output_attentions=output_attentions,
573
  output_hidden_states=output_hidden_states,
574
  images=images,
575
- images_seq_mask = images_seq_mask,
576
- images_spatial_crop = images_spatial_crop,
577
- return_dict=return_dict
578
-
579
  )
580
 
581
-
582
-
583
  # print(transformer_outputs)
584
 
585
  hidden_states = outputs[0]
@@ -613,9 +674,13 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
613
  attentions=outputs.attentions,
614
  )
615
 
616
-
617
  def prepare_inputs_for_generation(
618
- self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
 
 
 
 
 
619
  ):
620
  # Omit tokens covered by past_key_values
621
  past_length = 0
@@ -632,7 +697,10 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
632
  # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
633
  # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
634
  # input)
635
- if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
 
 
 
636
  input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
637
  # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
638
  # input_ids based on the past_length.
@@ -668,7 +736,11 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
668
 
669
  # TODO @gante we should only keep a `cache_position` in generate, and do +=1.
670
  # same goes for position ids. Could also help with continued generation.
671
- cache_position = torch.arange(past_length, past_length + position_ids.shape[-1], device=position_ids.device)
 
 
 
 
672
 
673
  # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
674
  if inputs_embeds is not None and past_key_values is None:
@@ -688,45 +760,55 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
688
  }
689
  )
690
  return model_inputs
691
-
692
 
693
  def disable_torch_init(self):
694
  """
695
  Disable the redundant torch default initialization to accelerate model creation.
696
  """
697
  import torch
 
698
  setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
699
  setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
700
 
701
-
702
-
703
- def infer(self, tokenizer, prompt='', image_file='', output_path = '', base_size=1024, image_size=640, crop_mode=True, test_compress=False, save_results=False, eval_mode=False):
 
 
 
 
 
 
 
 
 
 
704
  self.disable_torch_init()
705
 
706
  os.makedirs(output_path, exist_ok=True)
707
- os.makedirs(f'{output_path}/images', exist_ok=True)
708
 
709
  if prompt and image_file:
710
  conversation = [
711
  {
712
  "role": "<|User|>",
713
  # "content": "<image>\n<|grounding|>Given the layout of the image. ",
714
- "content": f'{prompt}',
715
  # "content": "君不见黄河之水天上来的下一句是什么?",
716
  # "content": "<image>\nFree OCR. ",
717
  # "content": "<image>\nParse the figure. ",
718
  # "content": "<image>\nExtract the text in the image. ",
719
- "images": [f'{image_file}'],
720
  },
721
  {"role": "<|Assistant|>", "content": ""},
722
  ]
723
-
724
  elif prompt:
725
  conversation = [
726
  {
727
  "role": "<|User|>",
728
  # "content": "<image>\n<|grounding|>Given the layout of the image. ",
729
- "content": f'{prompt}',
730
  # "content": "君不见黄河之水天上来的下一句是什么?",
731
  # "content": "<image>\nFree OCR. ",
732
  # "content": "<image>\nParse the figure. ",
@@ -736,9 +818,11 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
736
  {"role": "<|Assistant|>", "content": ""},
737
  ]
738
  else:
739
- assert False, f'prompt is none!'
740
-
741
- prompt = format_messages(conversations=conversation, sft_format='plain', system_prompt='')
 
 
742
 
743
  patch_size = 16
744
  downsample_ratio = 4
@@ -749,15 +833,16 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
749
 
750
  image_draw = images[0].copy()
751
 
752
- w,h = image_draw.size
753
  # print(w, h)
754
  ratio = 1 - ((max(w, h) - min(w, h)) / (max(w, h)))
755
-
756
 
757
- image_transform=BasicImageTransform(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), normalize=True)
 
 
758
  images_seq_mask = []
759
 
760
- image_token = '<image>'
761
  image_token_id = 128815
762
  text_splits = prompt.split(image_token)
763
 
@@ -765,13 +850,11 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
765
  tokenized_str = []
766
  images_spatial_crop = []
767
  for text_sep, image in zip(text_splits, images):
768
-
769
  tokenized_sep = text_encode(tokenizer, text_sep, bos=False, eos=False)
770
  tokenized_str += tokenized_sep
771
  images_seq_mask += [False] * len(tokenized_sep)
772
 
773
  if crop_mode:
774
-
775
  if image.size[0] <= 640 and image.size[1] <= 640:
776
  crop_ratio = [1, 1]
777
 
@@ -782,23 +865,22 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
782
  else:
783
  # best_width, best_height = self.image_size, self.image_size
784
  crop_ratio = [1, 1]
785
-
786
  """process the global view"""
787
  # image = image.resize((base_size, base_size))
788
- global_view = ImageOps.pad(image, (base_size, base_size),
789
- color=tuple(int(x * 255) for x in image_transform.mean))
790
-
 
 
 
791
  if base_size == 1024:
792
  valid_img_tokens += int(256 * ratio)
793
  elif base_size == 1280:
794
  valid_img_tokens += int(400 * ratio)
795
  # elif base_size == 640:
796
  # valid_img_tokens += int(100 * ratio)
797
-
798
-
799
 
800
-
801
-
802
  images_list.append(image_transform(global_view).to(torch.bfloat16))
803
 
804
  # global_view_tensor = image_transform(global_view).to(torch.bfloat16)
@@ -806,31 +888,34 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
806
  width_crop_num, height_crop_num = crop_ratio
807
 
808
  images_spatial_crop.append([width_crop_num, height_crop_num])
809
-
810
-
811
  if width_crop_num > 1 or height_crop_num > 1:
812
  """process the local views"""
813
-
814
  for i in range(len(images_crop_raw)):
815
- images_crop_list.append(image_transform(images_crop_raw[i]).to(torch.bfloat16))
816
-
 
 
817
  if image_size == 640:
818
  valid_img_tokens += len(images_crop_list) * 100
819
 
820
  num_queries = math.ceil((image_size // patch_size) / downsample_ratio)
821
- num_queries_base = math.ceil((base_size // patch_size) / downsample_ratio)
822
-
823
-
824
 
825
  """add image tokens"""
826
 
827
-
828
-
829
- tokenized_image = ([image_token_id] * num_queries_base + [image_token_id]) * num_queries_base
830
  tokenized_image += [image_token_id]
831
  if width_crop_num > 1 or height_crop_num > 1:
832
- tokenized_image += ([image_token_id] * (num_queries * width_crop_num) + [image_token_id]) * (
833
- num_queries * height_crop_num)
 
 
834
  tokenized_str += tokenized_image
835
  images_seq_mask += [True] * len(tokenized_image)
836
  # num_image_tokens.append(len(tokenized_image))
@@ -841,11 +926,14 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
841
 
842
  """process the global view"""
843
  if image_size <= 640:
844
- print('directly resize')
845
  image = image.resize((image_size, image_size))
846
  # else:
847
- global_view = ImageOps.pad(image, (image_size, image_size),
848
- color=tuple(int(x * 255) for x in image_transform.mean))
 
 
 
849
  images_list.append(image_transform(global_view).to(torch.bfloat16))
850
 
851
  if base_size == 1024:
@@ -861,18 +949,18 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
861
 
862
  images_spatial_crop.append([width_crop_num, height_crop_num])
863
 
864
-
865
  """add image tokens"""
866
  num_queries = math.ceil((image_size // patch_size) / downsample_ratio)
867
 
868
- tokenized_image = ([image_token_id] * num_queries + [image_token_id]) * num_queries
 
 
869
  tokenized_image += [image_token_id]
870
  # tokenized_image += ([self.image_token_id] * (num_queries * width_crop_num) + [self.image_token_id]) * (
871
  # num_queries * height_crop_num)
872
  tokenized_str += tokenized_image
873
  images_seq_mask += [True] * len(tokenized_image)
874
  # num_image_tokens.append(len(tokenized_image))
875
-
876
 
877
  """process the last text split"""
878
  tokenized_sep = text_encode(tokenizer, text_splits[-1], bos=False, eos=False)
@@ -881,19 +969,13 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
881
 
882
  """add the bos tokens"""
883
  bos_id = 0
884
- tokenized_str = [bos_id] + tokenized_str
885
  images_seq_mask = [False] + images_seq_mask
886
 
887
-
888
-
889
  input_ids = torch.LongTensor(tokenized_str)
890
 
891
-
892
-
893
-
894
  images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
895
 
896
-
897
  if len(images_list) == 0:
898
  images_ori = torch.zeros((1, 3, image_size, image_size))
899
  images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
@@ -907,131 +989,157 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
907
  else:
908
  images_crop = torch.zeros((1, 3, base_size, base_size))
909
 
910
-
911
-
912
  if not eval_mode:
913
- streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
914
- with torch.autocast("cuda", dtype=torch.bfloat16):
 
 
915
  with torch.no_grad():
916
  output_ids = self.generate(
917
- input_ids.unsqueeze(0).cuda(),
918
- images=[(images_crop.cuda(), images_ori.cuda())],
919
- images_seq_mask = images_seq_mask.unsqueeze(0).cuda(),
920
- images_spatial_crop = images_spatial_crop,
 
 
921
  # do_sample=False,
922
  # num_beams = 1,
923
  temperature=0.0,
924
  eos_token_id=tokenizer.eos_token_id,
925
  streamer=streamer,
926
  max_new_tokens=8192,
927
- no_repeat_ngram_size = 20,
928
- use_cache = True
929
- )
930
 
931
  else:
932
- with torch.autocast("cuda", dtype=torch.bfloat16):
933
  with torch.no_grad():
934
  output_ids = self.generate(
935
- input_ids.unsqueeze(0).cuda(),
936
- images=[(images_crop.cuda(), images_ori.cuda())],
937
- images_seq_mask = images_seq_mask.unsqueeze(0).cuda(),
938
- images_spatial_crop = images_spatial_crop,
 
 
939
  # do_sample=False,
940
  # num_beams = 1,
941
  temperature=0.0,
942
  eos_token_id=tokenizer.eos_token_id,
943
  max_new_tokens=8192,
944
- no_repeat_ngram_size = 35,
945
- use_cache = True
946
- )
947
-
948
-
949
- if '<image>' in conversation[0]['content'] and eval_mode:
950
- outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
951
- stop_str = '<|end▁of▁sentence|>'
952
- if outputs.endswith(stop_str):
953
- outputs = outputs[:-len(stop_str)]
954
- # re_match
955
- outputs = outputs.strip()
956
-
957
- return outputs
958
-
959
- if '<image>' in conversation[0]['content'] and test_compress:
960
- outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
961
- pure_texts_outputs_token_length = len(text_encode(tokenizer, outputs, bos=False, eos=False))
962
- print('='*50)
963
- print('image size: ', (w, h))
964
- print('valid image tokens: ', int(valid_img_tokens))
965
- print('output texts tokens (valid): ', pure_texts_outputs_token_length)
966
- print('compression ratio: ', round(pure_texts_outputs_token_length/valid_img_tokens, 2))
967
- print('='*50)
968
-
969
-
970
- if '<image>' in conversation[0]['content'] and save_results:
971
- outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
972
- stop_str = '<|end▁of▁sentence|>'
973
-
974
- print('='*15 + 'save results:' + '='*15)
975
-
 
 
 
 
 
 
 
 
 
976
  # # # # conv.messages[-1][-1] = outputs
977
  if outputs.endswith(stop_str):
978
- outputs = outputs[:-len(stop_str)]
979
  outputs = outputs.strip()
980
 
981
  matches_ref, matches_images, mathes_other = re_match(outputs)
982
  # print(matches_ref)
983
  result = process_image_with_refs(image_draw, matches_ref, output_path)
984
 
985
-
986
  for idx, a_match_image in enumerate(tqdm(matches_images, desc="image")):
987
- outputs = outputs.replace(a_match_image, '![](images/' + str(idx) + '.jpg)\n')
988
-
989
- for idx, a_match_other in enumerate(tqdm(mathes_other, desc="other")):
990
- outputs = outputs.replace(a_match_other, '').replace('\\coloneqq', ':=').replace('\\eqqcolon', '=:')
991
 
 
 
 
 
 
 
992
 
993
  # if 'structural formula' in conversation[0]['content']:
994
  # outputs = '<smiles>' + outputs + '</smiles>'
995
- with open(f'{output_path}/result.mmd', 'w', encoding = 'utf-8') as afile:
996
  afile.write(outputs)
997
 
998
- if 'line_type' in outputs:
999
  import matplotlib.pyplot as plt
1000
- lines = eval(outputs)['Line']['line']
1001
 
1002
- line_type = eval(outputs)['Line']['line_type']
 
 
1003
  # print(lines)
1004
 
1005
- endpoints = eval(outputs)['Line']['line_endpoint']
1006
 
1007
- fig, ax = plt.subplots(figsize=(3,3), dpi=200)
1008
  ax.set_xlim(-15, 15)
1009
  ax.set_ylim(-15, 15)
1010
 
1011
  for idx, line in enumerate(lines):
1012
  try:
1013
- p0 = eval(line.split(' -- ')[0])
1014
- p1 = eval(line.split(' -- ')[-1])
1015
 
1016
- if line_type[idx] == '--':
1017
- ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth=0.8, color='k')
 
 
1018
  else:
1019
- ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth = 0.8, color = 'k')
 
 
1020
 
1021
- ax.scatter(p0[0], p0[1], s=5, color = 'k')
1022
- ax.scatter(p1[0], p1[1], s=5, color = 'k')
1023
  except:
1024
  pass
1025
 
1026
  for endpoint in endpoints:
1027
-
1028
- label = endpoint.split(': ')[0]
1029
- (x, y) = eval(endpoint.split(': ')[1])
1030
- ax.annotate(label, (x, y), xytext=(1, 1), textcoords='offset points',
1031
- fontsize=5, fontweight='light')
1032
-
1033
-
1034
- plt.savefig(f'{output_path}/geo.jpg')
 
 
 
 
1035
  plt.close()
1036
 
1037
  result.save(f"{output_path}/result_with_boxes.jpg")
 
1
  from .modeling_deepseekv2 import DeepseekV2Model, DeepseekV2ForCausalLM
2
  from .configuration_deepseek_v2 import DeepseekV2Config
3
+ from transformers.modeling_outputs import (
4
+ BaseModelOutputWithPast,
5
+ CausalLMOutputWithPast,
6
+ )
7
  from typing import List, Optional, Tuple, Union
8
  from transformers.cache_utils import Cache
9
  import requests
 
28
 
29
 
30
  def load_image(image_path):
 
31
  try:
32
  image = Image.open(image_path)
33
+
34
  corrected_image = ImageOps.exif_transpose(image)
35
+
36
  return corrected_image
37
+
38
  except Exception as e:
39
  print(f"error: {e}")
40
  try:
 
44
 
45
 
46
  def re_match(text):
47
+ pattern = r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)"
48
  matches = re.findall(pattern, text, re.DOTALL)
49
 
50
  # pattern1 = r'<\|ref\|>.*?<\|/ref\|>\n'
 
53
  mathes_image = []
54
  mathes_other = []
55
  for a_match in matches:
56
+ if "<|ref|>image<|/ref|>" in a_match[0]:
57
  mathes_image.append(a_match[0])
58
  else:
59
  mathes_other.append(a_match[0])
 
61
 
62
 
63
  def extract_coordinates_and_label(ref_text, image_width, image_height):
 
64
  try:
65
  label_type = ref_text[1]
66
  cor_list = eval(ref_text[2])
 
72
 
73
 
74
  def draw_bounding_boxes(image, refs, ouput_path):
 
75
  image_width, image_height = image.size
76
+
77
  img_draw = image.copy()
78
  draw = ImageDraw.Draw(img_draw)
79
 
80
+ overlay = Image.new("RGBA", img_draw.size, (0, 0, 0, 0))
81
  draw2 = ImageDraw.Draw(overlay)
82
+
83
  # try:
84
  # except IOError:
85
  # try:
86
+ # font = ImageFont.truetype("DejaVuSans.ttf", 20)
87
  # except IOError:
88
  font = ImageFont.load_default()
89
 
90
  img_idx = 0
91
+
92
  for i, ref in enumerate(refs):
93
  try:
94
  result = extract_coordinates_and_label(ref, image_width, image_height)
95
  if result:
96
  label_type, points_list = result
 
 
97
 
98
+ color = (
99
+ np.random.randint(0, 200),
100
+ np.random.randint(0, 200),
101
+ np.random.randint(0, 255),
102
+ )
103
+
104
+ color_a = color + (20,)
105
  for points in points_list:
106
  x1, y1, x2, y2 = points
107
 
 
111
  x2 = int(x2 / 999 * image_width)
112
  y2 = int(y2 / 999 * image_height)
113
 
114
+ if label_type == "image":
115
  try:
116
  cropped = image.crop((x1, y1, x2, y2))
117
  cropped.save(f"{ouput_path}/images/{img_idx}.jpg")
 
119
  print(e)
120
  pass
121
  img_idx += 1
122
+
123
  try:
124
+ if label_type == "title":
125
  draw.rectangle([x1, y1, x2, y2], outline=color, width=4)
126
+ draw2.rectangle(
127
+ [x1, y1, x2, y2],
128
+ fill=color_a,
129
+ outline=(0, 0, 0, 0),
130
+ width=1,
131
+ )
132
  else:
133
  draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
134
+ draw2.rectangle(
135
+ [x1, y1, x2, y2],
136
+ fill=color_a,
137
+ outline=(0, 0, 0, 0),
138
+ width=1,
139
+ )
140
  text_x = x1
141
  text_y = max(0, y1 - 15)
142
+
 
143
  text_bbox = draw.textbbox((0, 0), label_type, font=font)
144
  text_width = text_bbox[2] - text_bbox[0]
145
  text_height = text_bbox[3] - text_bbox[1]
146
+ draw.rectangle(
147
+ [text_x, text_y, text_x + text_width, text_y + text_height],
148
+ fill=(255, 255, 255, 30),
149
+ )
150
+
151
  draw.text((text_x, text_y), label_type, font=font, fill=color)
152
  except:
153
  pass
 
158
 
159
 
160
  def process_image_with_refs(image, ref_texts, output_path):
 
161
  result_image = draw_bounding_boxes(image, ref_texts, output_path)
 
 
 
 
162
 
163
+ return result_image
164
 
165
 
166
  def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
167
+ best_ratio_diff = float("inf")
168
  best_ratio = (1, 1)
169
  area = width * height
170
  for ratio in target_ratios:
 
180
  return best_ratio
181
 
182
 
183
+ def dynamic_preprocess(
184
+ image, min_num=2, max_num=9, image_size=640, use_thumbnail=False
185
+ ):
186
  orig_width, orig_height = image.size
187
  aspect_ratio = orig_width / orig_height
188
 
189
  # calculate the existing image aspect ratio
190
  target_ratios = set(
191
+ (i, j)
192
+ for n in range(min_num, max_num + 1)
193
+ for i in range(1, n + 1)
194
+ for j in range(1, n + 1)
195
+ if i * j <= max_num and i * j >= min_num
196
+ )
197
  # print(target_ratios)
198
  target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
199
 
200
  # find the closest aspect ratio to the target
201
  target_aspect_ratio = find_closest_aspect_ratio(
202
+ aspect_ratio, target_ratios, orig_width, orig_height, image_size
203
+ )
204
 
205
  # print(target_aspect_ratio)
206
  # calculate the target width and height
 
216
  (i % (target_width // image_size)) * image_size,
217
  (i // (target_width // image_size)) * image_size,
218
  ((i % (target_width // image_size)) + 1) * image_size,
219
+ ((i // (target_width // image_size)) + 1) * image_size,
220
  )
221
  # split the image
222
  split_img = resized_img.crop(box)
 
228
  return processed_images, target_aspect_ratio
229
 
230
 
 
231
  def normalize_transform(mean, std):
232
  if mean is None and std is None:
233
  transform = None
234
  elif mean is None and std is not None:
235
+ mean = [0.0] * len(std)
236
  transform = transforms.Normalize(mean=mean, std=std)
237
  elif mean is not None and std is None:
238
+ std = [1.0] * len(mean)
239
  transform = transforms.Normalize(mean=mean, std=std)
240
  else:
241
  transform = transforms.Normalize(mean=mean, std=std)
 
243
  return transform
244
 
245
 
 
246
  def format_messages(
247
+ conversations: List[Dict[str, str]],
248
+ sft_format: str = "deepseek",
249
+ system_prompt: str = "",
250
  ):
251
  """
252
  Applies the SFT template to conversation.
 
280
 
281
  return t
282
 
283
+
284
  def load_pil_images(conversations: List[Dict[str, str]]) -> List[Image.Image]:
285
  """
286
 
 
311
  # print(image_path)
312
  # print('----------------')
313
  # exit()
314
+
315
  # pil_img = Image.open(image_path)
316
  pil_img = load_image(image_path)
317
  pil_img = pil_img.convert("RGB")
 
321
 
322
 
323
  class BaseTransform(ABC):
 
324
  def set_rng(self, *args, **kwargs):
325
  pass
326
 
 
334
 
335
  class BasicImageTransform(BaseTransform):
336
  def __init__(
337
+ self,
338
  mean: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
339
  std: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
340
+ normalize: bool = True,
341
  ):
342
  self.mean = mean
343
  self.std = std
344
+
345
+ transform_pipelines = [transforms.ToTensor()]
 
 
346
 
347
  normalize = normalize_transform(mean, std) if normalize else nn.Identity()
348
  if normalize is not None:
349
  transform_pipelines.append(normalize)
350
 
351
  self.transform = transforms.Compose(transform_pipelines)
352
+
353
  def __call__(self, x):
354
  x = self.transform(x)
355
  return x
356
 
357
+
358
  class NoEOSTextStreamer(TextStreamer):
359
  def on_finalized_text(self, text: str, stream_end: bool = False):
360
+ eos_text = self.tokenizer.decode(
361
+ [self.tokenizer.eos_token_id], skip_special_tokens=False
362
+ )
363
  text = text.replace(eos_text, "\n")
364
  print(text, flush=True, end="")
365
 
 
367
  class DeepseekOCRConfig(DeepseekV2Config):
368
  model_type = "DeepseekOCR"
369
 
370
+
371
  class DeepseekOCRModel(DeepseekV2Model):
372
  config_class = DeepseekOCRConfig
373
 
 
378
  self.vision_model = build_clip_l()
379
  # self.conv_2 = nn.Conv2d(in_channels=1024, out_channels=2048, kernel_size=2, stride=2)
380
  n_embed = 1280
381
+ self.projector = MlpProjector(
382
+ Dict(projector_type="linear", input_dim=2048, n_embed=n_embed)
383
+ )
384
  embed_std = 1 / torch.sqrt(torch.tensor(n_embed, dtype=torch.float32))
385
  self.image_newline = nn.Parameter(torch.randn(n_embed) * embed_std)
386
  self.view_seperator = nn.Parameter(torch.randn(n_embed) * embed_std)
387
 
 
 
 
388
  def forward(
389
  self,
390
  input_ids: torch.LongTensor = None,
 
400
  images_spatial_crop: Optional[torch.FloatTensor] = None,
401
  return_dict: Optional[bool] = None,
402
  ) -> Union[Tuple, BaseModelOutputWithPast]:
 
 
 
 
403
  if inputs_embeds is None:
404
  # inputs_embeds = self.embed_tokens(input_ids)
405
  inputs_embeds = self.get_input_embeddings()(input_ids)
406
 
407
+ sam_model = getattr(self, "sam_model", None)
 
 
408
  # sam_model = self.sam_model
409
+ vision_model = getattr(self, "vision_model", None)
 
 
 
 
410
 
411
+ if (
412
+ sam_model is not None
413
+ and (input_ids.shape[1] != 1 or self.training)
414
+ and torch.sum(images[0][1]).item() != 0
415
+ ):
416
  idx = 0
417
+
418
  # sam_model = torch.jit.script(sam_model)
419
+
420
  # start_time = time.time()
421
  for image, crop_shape in zip(images, images_spatial_crop):
422
  images_in_this_batch = []
 
425
  image_ori = image[1]
426
 
427
  with torch.no_grad():
428
+ # with torch.inference_mode():
429
+
430
  if torch.sum(patches).item() != 0:
431
  # P, C, H, W = patches.shape
432
  crop_flag = 1
433
  local_features_1 = sam_model(patches)
434
 
435
+ local_features_2 = vision_model(patches, local_features_1)
436
  # vit_time = time.time()
437
+ local_features = torch.cat(
438
+ (
439
+ local_features_2[:, 1:],
440
+ local_features_1.flatten(2).permute(0, 2, 1),
441
+ ),
442
+ dim=-1,
443
+ )
444
  local_features = self.projector(local_features)
445
 
 
446
  global_features_1 = sam_model(image_ori)
447
+ global_features_2 = vision_model(image_ori, global_features_1)
448
+ global_features = torch.cat(
449
+ (
450
+ global_features_2[:, 1:],
451
+ global_features_1.flatten(2).permute(0, 2, 1),
452
+ ),
453
+ dim=-1,
454
+ )
455
  global_features = self.projector(global_features)
456
 
457
+ print("=====================")
458
+ print("BASE: ", global_features.shape)
459
+ print("PATCHES: ", local_features.shape)
460
+ print("=====================")
461
 
462
  _, hw, n_dim = global_features.shape
463
+ h = w = int(hw**0.5)
464
 
465
  _2, hw2, n_dim2 = local_features.shape
466
+ h2 = w2 = int(hw2**0.5)
467
 
468
  width_crop_num, height_crop_num = crop_shape[0], crop_shape[1]
469
 
470
  global_features = global_features.view(h, w, n_dim)
471
 
472
  global_features = torch.cat(
473
+ [
474
+ global_features,
475
+ self.image_newline[None, None, :].expand(h, 1, n_dim),
476
+ ],
477
+ dim=1,
478
  )
479
 
480
  global_features = global_features.view(-1, n_dim)
481
 
482
+ local_features = (
483
+ local_features.view(
484
+ height_crop_num, width_crop_num, h2, w2, n_dim2
485
+ )
486
+ .permute(0, 2, 1, 3, 4)
487
+ .reshape(height_crop_num * h2, width_crop_num * w2, n_dim2)
488
+ )
489
  local_features = torch.cat(
490
+ [
491
+ local_features,
492
+ self.image_newline[None, None, :].expand(
493
+ height_crop_num * h2, 1, n_dim2
494
+ ),
495
+ ],
496
+ dim=1,
497
  )
498
  local_features = local_features.view(-1, n_dim2)
499
 
500
+ global_local_features = torch.cat(
501
+ [
502
+ local_features,
503
+ global_features,
504
+ self.view_seperator[None, :],
505
+ ],
506
+ dim=0,
507
+ )
508
 
509
  # end_time = time.time()
510
 
 
513
  # print('all: ', end_time - start_time)
514
 
515
  # exit()
516
+
517
  else:
518
  global_features_1 = sam_model(image_ori)
519
+ global_features_2 = vision_model(image_ori, global_features_1)
520
+ global_features = torch.cat(
521
+ (
522
+ global_features_2[:, 1:],
523
+ global_features_1.flatten(2).permute(0, 2, 1),
524
+ ),
525
+ dim=-1,
526
+ )
527
  global_features = self.projector(global_features)
528
+ print("=====================")
529
+ print("BASE: ", global_features.shape)
530
+ print("NO PATCHES")
531
+ print("=====================")
532
  _, hw, n_dim = global_features.shape
533
+ h = w = int(hw**0.5)
 
534
 
535
  global_features = global_features.view(h, w, n_dim)
536
 
537
  global_features = torch.cat(
538
+ [
539
+ global_features,
540
+ self.image_newline[None, None, :].expand(h, 1, n_dim),
541
+ ],
542
+ dim=1,
543
  )
544
 
545
  global_features = global_features.view(-1, n_dim)
546
 
547
+ global_local_features = torch.cat(
548
+ [global_features, self.view_seperator[None, :]], dim=0
549
+ )
550
 
551
  images_in_this_batch.append(global_local_features)
 
552
 
553
  # print(inputs_embeds.shape)
554
 
 
556
  images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
557
  # exit()
558
 
559
+ inputs_embeds[idx].masked_scatter_(
560
+ images_seq_mask[idx].unsqueeze(-1).to(self.device),
561
+ images_in_this_batch,
562
+ )
563
 
564
  idx += 1
 
565
 
566
  return super(DeepseekOCRModel, self).forward(
567
+ input_ids=None,
568
+ attention_mask=attention_mask,
569
+ past_key_values=past_key_values,
570
+ inputs_embeds=inputs_embeds,
571
+ use_cache=use_cache,
572
+ position_ids=position_ids,
573
+ output_attentions=output_attentions,
574
+ output_hidden_states=output_hidden_states,
575
+ return_dict=return_dict,
576
  )
 
577
 
 
578
 
579
+ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
580
  config_class = DeepseekOCRConfig
581
  # supports_gradient_checkpointing = True
582
 
 
596
  def get_model(self):
597
  return self.model
598
 
 
599
  def forward(
600
  self,
601
  input_ids: torch.LongTensor = None,
 
611
  images_seq_mask: Optional[torch.FloatTensor] = None,
612
  images_spatial_crop: Optional[torch.FloatTensor] = None,
613
  return_dict: Optional[bool] = None,
 
614
  ) -> Union[Tuple, CausalLMOutputWithPast]:
615
+ output_attentions = (
616
+ output_attentions
617
+ if output_attentions is not None
618
+ else self.config.output_attentions
619
+ )
620
  output_hidden_states = (
621
+ output_hidden_states
622
+ if output_hidden_states is not None
623
+ else self.config.output_hidden_states
624
+ )
625
+ return_dict = (
626
+ return_dict if return_dict is not None else self.config.use_return_dict
627
  )
 
 
 
628
 
629
+ outputs = self.model(
630
  input_ids=input_ids,
631
  past_key_values=past_key_values,
632
  attention_mask=attention_mask,
 
636
  output_attentions=output_attentions,
637
  output_hidden_states=output_hidden_states,
638
  images=images,
639
+ images_seq_mask=images_seq_mask,
640
+ images_spatial_crop=images_spatial_crop,
641
+ return_dict=return_dict,
 
642
  )
643
 
 
 
644
  # print(transformer_outputs)
645
 
646
  hidden_states = outputs[0]
 
674
  attentions=outputs.attentions,
675
  )
676
 
 
677
  def prepare_inputs_for_generation(
678
+ self,
679
+ input_ids,
680
+ past_key_values=None,
681
+ attention_mask=None,
682
+ inputs_embeds=None,
683
+ **kwargs,
684
  ):
685
  # Omit tokens covered by past_key_values
686
  past_length = 0
 
697
  # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
698
  # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
699
  # input)
700
+ if (
701
+ attention_mask is not None
702
+ and attention_mask.shape[1] > input_ids.shape[1]
703
+ ):
704
  input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
705
  # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
706
  # input_ids based on the past_length.
 
736
 
737
  # TODO @gante we should only keep a `cache_position` in generate, and do +=1.
738
  # same goes for position ids. Could also help with continued generation.
739
+ cache_position = torch.arange(
740
+ past_length,
741
+ past_length + position_ids.shape[-1],
742
+ device=position_ids.device,
743
+ )
744
 
745
  # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
746
  if inputs_embeds is not None and past_key_values is None:
 
760
  }
761
  )
762
  return model_inputs
 
763
 
764
  def disable_torch_init(self):
765
  """
766
  Disable the redundant torch default initialization to accelerate model creation.
767
  """
768
  import torch
769
+
770
  setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
771
  setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
772
 
773
+ def infer(
774
+ self,
775
+ tokenizer,
776
+ prompt="",
777
+ image_file="",
778
+ output_path="",
779
+ base_size=1024,
780
+ image_size=640,
781
+ crop_mode=True,
782
+ test_compress=False,
783
+ save_results=False,
784
+ eval_mode=False,
785
+ ):
786
  self.disable_torch_init()
787
 
788
  os.makedirs(output_path, exist_ok=True)
789
+ os.makedirs(f"{output_path}/images", exist_ok=True)
790
 
791
  if prompt and image_file:
792
  conversation = [
793
  {
794
  "role": "<|User|>",
795
  # "content": "<image>\n<|grounding|>Given the layout of the image. ",
796
+ "content": f"{prompt}",
797
  # "content": "君不见黄河之水天上来的下一句是什么?",
798
  # "content": "<image>\nFree OCR. ",
799
  # "content": "<image>\nParse the figure. ",
800
  # "content": "<image>\nExtract the text in the image. ",
801
+ "images": [f"{image_file}"],
802
  },
803
  {"role": "<|Assistant|>", "content": ""},
804
  ]
805
+
806
  elif prompt:
807
  conversation = [
808
  {
809
  "role": "<|User|>",
810
  # "content": "<image>\n<|grounding|>Given the layout of the image. ",
811
+ "content": f"{prompt}",
812
  # "content": "君不见黄河之水天上来的下一句是什么?",
813
  # "content": "<image>\nFree OCR. ",
814
  # "content": "<image>\nParse the figure. ",
 
818
  {"role": "<|Assistant|>", "content": ""},
819
  ]
820
  else:
821
+ assert False, f"prompt is none!"
822
+
823
+ prompt = format_messages(
824
+ conversations=conversation, sft_format="plain", system_prompt=""
825
+ )
826
 
827
  patch_size = 16
828
  downsample_ratio = 4
 
833
 
834
  image_draw = images[0].copy()
835
 
836
+ w, h = image_draw.size
837
  # print(w, h)
838
  ratio = 1 - ((max(w, h) - min(w, h)) / (max(w, h)))
 
839
 
840
+ image_transform = BasicImageTransform(
841
+ mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), normalize=True
842
+ )
843
  images_seq_mask = []
844
 
845
+ image_token = "<image>"
846
  image_token_id = 128815
847
  text_splits = prompt.split(image_token)
848
 
 
850
  tokenized_str = []
851
  images_spatial_crop = []
852
  for text_sep, image in zip(text_splits, images):
 
853
  tokenized_sep = text_encode(tokenizer, text_sep, bos=False, eos=False)
854
  tokenized_str += tokenized_sep
855
  images_seq_mask += [False] * len(tokenized_sep)
856
 
857
  if crop_mode:
 
858
  if image.size[0] <= 640 and image.size[1] <= 640:
859
  crop_ratio = [1, 1]
860
 
 
865
  else:
866
  # best_width, best_height = self.image_size, self.image_size
867
  crop_ratio = [1, 1]
868
+
869
  """process the global view"""
870
  # image = image.resize((base_size, base_size))
871
+ global_view = ImageOps.pad(
872
+ image,
873
+ (base_size, base_size),
874
+ color=tuple(int(x * 255) for x in image_transform.mean),
875
+ )
876
+
877
  if base_size == 1024:
878
  valid_img_tokens += int(256 * ratio)
879
  elif base_size == 1280:
880
  valid_img_tokens += int(400 * ratio)
881
  # elif base_size == 640:
882
  # valid_img_tokens += int(100 * ratio)
 
 
883
 
 
 
884
  images_list.append(image_transform(global_view).to(torch.bfloat16))
885
 
886
  # global_view_tensor = image_transform(global_view).to(torch.bfloat16)
 
888
  width_crop_num, height_crop_num = crop_ratio
889
 
890
  images_spatial_crop.append([width_crop_num, height_crop_num])
891
+
 
892
  if width_crop_num > 1 or height_crop_num > 1:
893
  """process the local views"""
894
+
895
  for i in range(len(images_crop_raw)):
896
+ images_crop_list.append(
897
+ image_transform(images_crop_raw[i]).to(torch.bfloat16)
898
+ )
899
+
900
  if image_size == 640:
901
  valid_img_tokens += len(images_crop_list) * 100
902
 
903
  num_queries = math.ceil((image_size // patch_size) / downsample_ratio)
904
+ num_queries_base = math.ceil(
905
+ (base_size // patch_size) / downsample_ratio
906
+ )
907
 
908
  """add image tokens"""
909
 
910
+ tokenized_image = (
911
+ [image_token_id] * num_queries_base + [image_token_id]
912
+ ) * num_queries_base
913
  tokenized_image += [image_token_id]
914
  if width_crop_num > 1 or height_crop_num > 1:
915
+ tokenized_image += (
916
+ [image_token_id] * (num_queries * width_crop_num)
917
+ + [image_token_id]
918
+ ) * (num_queries * height_crop_num)
919
  tokenized_str += tokenized_image
920
  images_seq_mask += [True] * len(tokenized_image)
921
  # num_image_tokens.append(len(tokenized_image))
 
926
 
927
  """process the global view"""
928
  if image_size <= 640:
929
+ print("directly resize")
930
  image = image.resize((image_size, image_size))
931
  # else:
932
+ global_view = ImageOps.pad(
933
+ image,
934
+ (image_size, image_size),
935
+ color=tuple(int(x * 255) for x in image_transform.mean),
936
+ )
937
  images_list.append(image_transform(global_view).to(torch.bfloat16))
938
 
939
  if base_size == 1024:
 
949
 
950
  images_spatial_crop.append([width_crop_num, height_crop_num])
951
 
 
952
  """add image tokens"""
953
  num_queries = math.ceil((image_size // patch_size) / downsample_ratio)
954
 
955
+ tokenized_image = (
956
+ [image_token_id] * num_queries + [image_token_id]
957
+ ) * num_queries
958
  tokenized_image += [image_token_id]
959
  # tokenized_image += ([self.image_token_id] * (num_queries * width_crop_num) + [self.image_token_id]) * (
960
  # num_queries * height_crop_num)
961
  tokenized_str += tokenized_image
962
  images_seq_mask += [True] * len(tokenized_image)
963
  # num_image_tokens.append(len(tokenized_image))
 
964
 
965
  """process the last text split"""
966
  tokenized_sep = text_encode(tokenizer, text_splits[-1], bos=False, eos=False)
 
969
 
970
  """add the bos tokens"""
971
  bos_id = 0
972
+ tokenized_str = [bos_id] + tokenized_str
973
  images_seq_mask = [False] + images_seq_mask
974
 
 
 
975
  input_ids = torch.LongTensor(tokenized_str)
976
 
 
 
 
977
  images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
978
 
 
979
  if len(images_list) == 0:
980
  images_ori = torch.zeros((1, 3, image_size, image_size))
981
  images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
 
989
  else:
990
  images_crop = torch.zeros((1, 3, base_size, base_size))
991
 
 
 
992
  if not eval_mode:
993
+ streamer = NoEOSTextStreamer(
994
+ tokenizer, skip_prompt=True, skip_special_tokens=False
995
+ )
996
+ with torch.autocast(self.device.type, dtype=torch.bfloat16):
997
  with torch.no_grad():
998
  output_ids = self.generate(
999
+ input_ids.unsqueeze(0).to(self.device),
1000
+ images=[
1001
+ (images_crop.to(self.device), images_ori.to(self.device))
1002
+ ],
1003
+ images_seq_mask=images_seq_mask.unsqueeze(0).to(self.device),
1004
+ images_spatial_crop=images_spatial_crop,
1005
  # do_sample=False,
1006
  # num_beams = 1,
1007
  temperature=0.0,
1008
  eos_token_id=tokenizer.eos_token_id,
1009
  streamer=streamer,
1010
  max_new_tokens=8192,
1011
+ no_repeat_ngram_size=20,
1012
+ use_cache=True,
1013
+ )
1014
 
1015
  else:
1016
+ with torch.autocast(self.device.type, dtype=torch.bfloat16):
1017
  with torch.no_grad():
1018
  output_ids = self.generate(
1019
+ input_ids.unsqueeze(0).to(self.device),
1020
+ images=[
1021
+ (images_crop.to(self.device), images_ori.to(self.device))
1022
+ ],
1023
+ images_seq_mask=images_seq_mask.unsqueeze(0).to(self.device),
1024
+ images_spatial_crop=images_spatial_crop,
1025
  # do_sample=False,
1026
  # num_beams = 1,
1027
  temperature=0.0,
1028
  eos_token_id=tokenizer.eos_token_id,
1029
  max_new_tokens=8192,
1030
+ no_repeat_ngram_size=35,
1031
+ use_cache=True,
1032
+ )
1033
+
1034
+ if "<image>" in conversation[0]["content"] and eval_mode:
1035
+ outputs = tokenizer.decode(
1036
+ output_ids[0, input_ids.unsqueeze(0).to(self.device).shape[1] :]
1037
+ )
1038
+ stop_str = "<|end▁of▁sentence|>"
1039
+ if outputs.endswith(stop_str):
1040
+ outputs = outputs[: -len(stop_str)]
1041
+ # re_match
1042
+ outputs = outputs.strip()
1043
+
1044
+ return outputs
1045
+
1046
+ if "<image>" in conversation[0]["content"] and test_compress:
1047
+ outputs = tokenizer.decode(
1048
+ output_ids[0, input_ids.unsqueeze(0).to(self.device).shape[1] :]
1049
+ )
1050
+ pure_texts_outputs_token_length = len(
1051
+ text_encode(tokenizer, outputs, bos=False, eos=False)
1052
+ )
1053
+ print("=" * 50)
1054
+ print("image size: ", (w, h))
1055
+ print("valid image tokens: ", int(valid_img_tokens))
1056
+ print("output texts tokens (valid): ", pure_texts_outputs_token_length)
1057
+ print(
1058
+ "compression ratio: ",
1059
+ round(pure_texts_outputs_token_length / valid_img_tokens, 2),
1060
+ )
1061
+ print("=" * 50)
1062
+
1063
+ if "<image>" in conversation[0]["content"] and save_results:
1064
+ outputs = tokenizer.decode(
1065
+ output_ids[0, input_ids.unsqueeze(0).to(self.device).shape[1] :]
1066
+ )
1067
+ stop_str = "<|end▁of▁sentence|>"
1068
+
1069
+ print("=" * 15 + "save results:" + "=" * 15)
1070
+
1071
  # # # # conv.messages[-1][-1] = outputs
1072
  if outputs.endswith(stop_str):
1073
+ outputs = outputs[: -len(stop_str)]
1074
  outputs = outputs.strip()
1075
 
1076
  matches_ref, matches_images, mathes_other = re_match(outputs)
1077
  # print(matches_ref)
1078
  result = process_image_with_refs(image_draw, matches_ref, output_path)
1079
 
 
1080
  for idx, a_match_image in enumerate(tqdm(matches_images, desc="image")):
1081
+ outputs = outputs.replace(
1082
+ a_match_image, "![](images/" + str(idx) + ".jpg)\n"
1083
+ )
 
1084
 
1085
+ for idx, a_match_other in enumerate(tqdm(mathes_other, desc="other")):
1086
+ outputs = (
1087
+ outputs.replace(a_match_other, "")
1088
+ .replace("\\coloneqq", ":=")
1089
+ .replace("\\eqqcolon", "=:")
1090
+ )
1091
 
1092
  # if 'structural formula' in conversation[0]['content']:
1093
  # outputs = '<smiles>' + outputs + '</smiles>'
1094
+ with open(f"{output_path}/result.mmd", "w", encoding="utf-8") as afile:
1095
  afile.write(outputs)
1096
 
1097
+ if "line_type" in outputs:
1098
  import matplotlib.pyplot as plt
 
1099
 
1100
+ lines = eval(outputs)["Line"]["line"]
1101
+
1102
+ line_type = eval(outputs)["Line"]["line_type"]
1103
  # print(lines)
1104
 
1105
+ endpoints = eval(outputs)["Line"]["line_endpoint"]
1106
 
1107
+ fig, ax = plt.subplots(figsize=(3, 3), dpi=200)
1108
  ax.set_xlim(-15, 15)
1109
  ax.set_ylim(-15, 15)
1110
 
1111
  for idx, line in enumerate(lines):
1112
  try:
1113
+ p0 = eval(line.split(" -- ")[0])
1114
+ p1 = eval(line.split(" -- ")[-1])
1115
 
1116
+ if line_type[idx] == "--":
1117
+ ax.plot(
1118
+ [p0[0], p1[0]], [p0[1], p1[1]], linewidth=0.8, color="k"
1119
+ )
1120
  else:
1121
+ ax.plot(
1122
+ [p0[0], p1[0]], [p0[1], p1[1]], linewidth=0.8, color="k"
1123
+ )
1124
 
1125
+ ax.scatter(p0[0], p0[1], s=5, color="k")
1126
+ ax.scatter(p1[0], p1[1], s=5, color="k")
1127
  except:
1128
  pass
1129
 
1130
  for endpoint in endpoints:
1131
+ label = endpoint.split(": ")[0]
1132
+ (x, y) = eval(endpoint.split(": ")[1])
1133
+ ax.annotate(
1134
+ label,
1135
+ (x, y),
1136
+ xytext=(1, 1),
1137
+ textcoords="offset points",
1138
+ fontsize=5,
1139
+ fontweight="light",
1140
+ )
1141
+
1142
+ plt.savefig(f"{output_path}/geo.jpg")
1143
  plt.close()
1144
 
1145
  result.save(f"{output_path}/result_with_boxes.jpg")