Spaces:

Shadow0704
/

Agent_OCR

Sleeping

App Files Files Community

Shadow0704 committed on Oct 24

Commit

b85866b

verified ·

1 Parent(s): b133a37

Upload 5 files

Browse files

Files changed (5) hide show

app.py +87 -0
model.py +113 -0
preprocess.py +37 -0
requirements.txt +0 -0
vintern_fast.py +201 -0

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+# app.py
+import os
+import time
+from typing import Tuple
+import gradio as gr
+from PIL import Image
+import torch
+from model import OCRModel
+from preprocess import crop_by_region, to_tensor_one_tile  # dùng hàm sẵn có của bạn
# Hugging Face checkpoint served by this demo.
MODEL_ID = "5CD-AI/Vintern-1B-v3_5"

# Flash-attention stays off for the free CPU tier; it can be switched on
# when the Space runs on a GPU (e.g. an A10G).
ocr_model = OCRModel(
    model_id=MODEL_ID,
    allow_flash_attn=False,
)

# Instruction pre-filled in the prompt box and used when the box is empty.
DEFAULT_PROMPT = "Chỉ trả về đúng nội dung văn bản nhìn thấy trong ảnh (không thêm giải thích)."

# Choices offered by the two radio groups of the UI.
REGIONS = ["full", "head", "body", "foot"]
PRESETS = ["fast", "quality"]
def ensure_model_loaded():
    """Load the shared OCR model on first use; do nothing once it is loaded."""
    if ocr_model.is_loaded:
        return
    ocr_model.load()
def run_ocr(
    image: "Image.Image | None",
    region: str,
    preset: str,
    prompt: str,
    max_new_tokens: int,
) -> str:
    """Run OCR on one uploaded image and return the text with a timing footer.

    Args:
        image: Picture coming from the UI, or None when nothing was uploaded.
        region: Band of the page to read; forwarded to ``crop_by_region``.
        preset: ``"fast"`` caps the token budget at 512 and uses a repetition
            penalty of 1.05; any other value keeps the full budget with 1.10.
        prompt: Instruction for the model. Empty or whitespace-only input
            falls back to ``DEFAULT_PROMPT``.
        max_new_tokens: Generation budget; coerced to ``int`` in case the UI
            hands over a float.

    Returns:
        The recognized text followed by the elapsed time and device summary,
        or a warning message when no image was provided.
    """
    if image is None:
        return "⚠️ Chưa chọn ảnh."
    ensure_model_loaded()

    # 1) Keep only the requested band of the page.
    cropped = crop_by_region(image, region=region, head_ratio=0.28, foot_ratio=0.22)

    # 2) Single 448x448 tile. No manual device/dtype cast is needed here:
    #    OCRModel.chat aligns the tensor with the model before generating.
    px = to_tensor_one_tile(cropped, input_size=448)

    # 3) Generation parameters for the chosen preset.
    token_budget = int(max_new_tokens)
    if preset == "fast":
        token_budget = min(512, token_budget)
        penalty = 1.05
    else:
        penalty = 1.10
    gen = dict(
        max_new_tokens=token_budget,
        do_sample=False,
        num_beams=1,
        repetition_penalty=penalty,
    )

    # A prompt made only of whitespace is truthy, so strip first and fall
    # back to the default afterwards; otherwise the model gets no instruction.
    instruction = (prompt or "").strip() or DEFAULT_PROMPT
    question = f"<image>\n{instruction}\n"

    # perf_counter is monotonic, unlike wall-clock time, so the measured
    # interval cannot be skewed by system clock adjustments.
    t0 = time.perf_counter()
    text = ocr_model.chat(px, question, **gen)
    dt = time.perf_counter() - t0
    return f"{text}\n\n— elapsed: {dt:.2f}s | device: {ocr_model.device_str}"
# Page layout: image and crop/preset selectors in the left column; prompt,
# token budget, trigger button and result box in the right column.
with gr.Blocks(title="OCR Demo (Gradio)") as demo:
    gr.Markdown(
        "# OCR Demo (Gradio)\n"
        "Upload ảnh giấy tờ → chọn **vùng** → bấm **Extract**.\n"
        f"Model: `{MODEL_ID}`"
    )
    with gr.Row():
        with gr.Column(scale=1):
            inp_img = gr.Image(
                type="pil",
                label="Ảnh",
                sources=["upload", "clipboard"],
            )
            region = gr.Radio(choices=REGIONS, value="full", label="Vùng cắt")
            preset = gr.Radio(choices=PRESETS, value="fast", label="Chế độ")
        with gr.Column(scale=1):
            prompt = gr.Textbox(value=DEFAULT_PROMPT, label="Prompt", lines=3)
            max_tokens = gr.Slider(
                minimum=16,
                maximum=512,
                value=128,
                step=8,
                label="max_new_tokens",
            )
            btn = gr.Button("Extract nội dung", variant="primary")
            out = gr.Textbox(label="Kết quả OCR", lines=18)

    # Wire the button: the five inputs map onto run_ocr's parameters in order.
    btn.click(
        fn=run_ocr,
        inputs=[inp_img, region, preset, prompt, max_tokens],
        outputs=[out],
    )

if __name__ == "__main__":
    # Local run: open http://127.0.0.1:7860
    # On Hugging Face nothing needs changing; Spaces binds the port itself.
    demo.launch()

model.py ADDED Viewed

	@@ -0,0 +1,113 @@

+# model.py
+import torch
+from transformers import AutoModel, AutoTokenizer, GenerationConfig
class OCRModel:
    """Lazily loaded wrapper around a Hugging Face chat-style vision checkpoint.

    The constructor only picks the device and dtype. Tokenizer and weights are
    fetched by :meth:`load`, which :meth:`chat` triggers on first use.
    """

    # generate() arguments removed from the generation dict because they may
    # otherwise end up being passed twice ("multiple values" errors).
    _CONFLICTING_KEYS = (
        "use_cache",
        "output_attentions",
        "output_hidden_states",
        "return_dict_in_generate",
        "synced_gpus",
    )
    _TOKEN_ID_KEYS = ("eos_token_id", "pad_token_id", "bos_token_id")

    def __init__(
        self,
        model_id: str = "5CD-AI/Vintern-1B-v3_5",
        allow_flash_attn: bool = False,
        prefer_bfloat16: bool = False,
    ):
        """Choose device/dtype; nothing is downloaded or loaded yet.

        Args:
            model_id: Hub id of the checkpoint to load.
            allow_flash_attn: Request flash-attention; only honored on CUDA.
            prefer_bfloat16: On CUDA, use bfloat16 when the GPU supports it
                (float16 otherwise). CPU always uses float32.
        """
        self.model_id = model_id
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.device.type != "cuda":
            self.dtype = torch.float32
        elif prefer_bfloat16 and torch.cuda.is_bf16_supported():
            self.dtype = torch.bfloat16
        else:
            self.dtype = torch.float16
        self.allow_flash_attn = bool(allow_flash_attn and self.device.type == "cuda")
        self.model = None
        self.tokenizer = None
        self.is_loaded = False

    @property
    def on_cuda(self):
        """True when the selected device is a CUDA device."""
        return self.device.type == "cuda"

    @property
    def device_str(self):
        """Human-readable ``"<device> (<dtype>)"`` summary."""
        return f"{self.device} ({self.dtype})"

    def load(self):
        """Load tokenizer and model, then move the model to the chosen device.

        The flash-attention choice made in ``__init__`` is forwarded as
        ``use_flash_attn``; previously the flag was computed but never reached
        ``from_pretrained``.
        NOTE(review): this assumes the checkpoint's remote code accepts
        ``use_flash_attn`` (the Vintern checkpoint is loaded with that keyword
        elsewhere in this project) - confirm before pointing at other models.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
        shared = dict(trust_remote_code=True, use_flash_attn=self.allow_flash_attn)
        # Prefer the newer ``dtype=`` keyword, fall back to ``torch_dtype=``.
        try:
            self.model = AutoModel.from_pretrained(self.model_id, dtype=self.dtype, **shared)
        except TypeError:
            self.model = AutoModel.from_pretrained(self.model_id, torch_dtype=self.dtype, **shared)
        self.model.to(device=self.device, dtype=self.dtype)
        self.model.eval()
        if getattr(self.model, "generation_config", None) is None:
            self.model.generation_config = GenerationConfig()
        self.is_loaded = True

    def _build_gen_dict(self, **gen_kwargs) -> dict:
        """Build the plain-dict generation config handed to the model's chat().

        Starts from the model's own generation config (when it can be turned
        into a dict), overlays the caller's keyword arguments, fills in missing
        token ids from the tokenizer and drops the keys that could be passed
        twice further down the generate() call.

        A token id that is present but ``None`` counts as missing: a serialized
        config can carry ``None`` placeholders, and a pure membership test
        would then never apply the tokenizer fallback.
        """
        base = {}
        cfg = getattr(self.model, "generation_config", None)
        if cfg is not None:
            try:
                base = cfg.to_dict()
            except Exception:
                # Best effort: an unserializable config just means no base values.
                base = {}

        # Values coming from the caller (UI) win over the model defaults.
        base.update(gen_kwargs)

        if base.get("eos_token_id") is None and hasattr(self.tokenizer, "eos_token_id"):
            base["eos_token_id"] = self.tokenizer.eos_token_id
        if base.get("pad_token_id") is None:
            pad_id = getattr(self.tokenizer, "pad_token_id", None)
            base["pad_token_id"] = pad_id if pad_id is not None else base.get("eos_token_id")
        if base.get("bos_token_id") is None and hasattr(self.tokenizer, "bos_token_id"):
            base["bos_token_id"] = self.tokenizer.bos_token_id

        # Coerce scalar token ids to int; anything that is not int-convertible
        # (for example a list of ids) is left untouched.
        for key in self._TOKEN_ID_KEYS:
            value = base.get(key)
            if value is None:
                continue
            try:
                base[key] = int(value)
            except (TypeError, ValueError):
                pass

        for key in self._CONFLICTING_KEYS:
            base.pop(key, None)
        return base

    def chat(self, pixel_values: torch.Tensor, question: str, **gen_kwargs) -> str:
        """Ask the model ``question`` about ``pixel_values`` and return its answer.

        Loads the model on first use, aligns the input tensor with the model's
        device and parameter dtype, and forwards ``gen_kwargs`` through
        :meth:`_build_gen_dict`. If the model returns a list or tuple, its
        first element is returned.
        """
        if not self.is_loaded:
            self.load()

        # Match the input to the model to avoid float/half mismatches.
        model_dtype = next(self.model.parameters()).dtype
        pixel_values = pixel_values.to(device=self.device, dtype=model_dtype)

        gen_dict = self._build_gen_dict(**gen_kwargs)

        # Pure inference: no autograd bookkeeping is needed while generating.
        with torch.inference_mode():
            out = self.model.chat(
                pixel_values=pixel_values,
                question=question,
                tokenizer=self.tokenizer,
                generation_config=gen_dict,
            )
        if isinstance(out, (list, tuple)) and len(out) >= 1:
            return out[0]
        return out

preprocess.py ADDED Viewed

	@@ -0,0 +1,37 @@

+from PIL import Image
+import torchvision.transforms as T
+from torchvision.transforms.functional import InterpolationMode
+import torch
# Per-channel mean / standard deviation used by the normalization step.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# Tile edge length (in pixels) used when a caller does not pass input_size.
DEFAULT_INPUT_SIZE = 448
def build_transform(input_size: int) -> T.Compose:
    """Return the tile pipeline: force RGB, square resize, to tensor, normalize."""

    def _as_rgb(img):
        # Leave RGB images alone; convert every other mode.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_as_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BILINEAR),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
def crop_regions(pil_img: "Image.Image", head_ratio=0.28, foot_ratio=0.22):
    """Split an image into three full-width horizontal bands.

    Args:
        pil_img: Source image; only ``size`` and ``crop`` are used.
        head_ratio: Fraction of the height taken by the top band.
        foot_ratio: Fraction of the height taken by the bottom band.

    Returns:
        ``(head, body, foot)`` crops, where ``body`` is whatever lies between
        the head and the foot.

    Raises:
        ValueError: If a ratio lies outside ``[0, 1]`` or the head and foot
            bands would overlap. Such input used to produce an inverted or
            out-of-bounds crop box instead of a clear error.
    """
    if not (0 <= head_ratio <= 1 and 0 <= foot_ratio <= 1):
        raise ValueError(
            f"head_ratio and foot_ratio must be within [0, 1], got {head_ratio} and {foot_ratio}"
        )

    w, h = pil_img.size
    head_h = int(h * head_ratio)
    foot_h = int(h * foot_ratio)
    if head_h + foot_h > h:
        raise ValueError(
            f"head ({head_h}px) and foot ({foot_h}px) bands overlap for image height {h}px"
        )

    body_bottom = h - foot_h
    head = pil_img.crop((0, 0, w, head_h))
    foot = pil_img.crop((0, body_bottom, w, h))
    body = pil_img.crop((0, head_h, w, body_bottom))
    return head, body, foot
def crop_by_region(pil_img: Image.Image, region: str, head_ratio=0.28, foot_ratio=0.22) -> Image.Image:
    """Return the band of ``pil_img`` named by ``region`` (case-insensitive).

    ``"full"``, an empty value and any unrecognized name all yield the
    original image; ``"head"``, ``"body"`` and ``"foot"`` yield the matching
    crop produced by ``crop_regions``.
    """
    key = region.lower() if region else "full"
    if key == "full":
        return pil_img

    head, body, foot = crop_regions(pil_img, head_ratio=head_ratio, foot_ratio=foot_ratio)
    if key == "head":
        return head
    if key == "body":
        return body
    if key == "foot":
        return foot
    return pil_img
def to_tensor_one_tile(pil_img: Image.Image, input_size=DEFAULT_INPUT_SIZE, pin_memory=False) -> torch.Tensor:
    """Turn one image into a single-tile batch (leading dimension of 1).

    The image goes through ``build_transform(input_size)``; when
    ``pin_memory`` is set the resulting tensor is pinned before returning.
    """
    pipeline = build_transform(input_size=input_size)
    tile = pipeline(pil_img)
    batch = tile.unsqueeze(0)
    return batch.pin_memory() if pin_memory else batch

requirements.txt ADDED Viewed

Binary file (2.4 kB). View file

vintern_fast.py ADDED Viewed

	@@ -0,0 +1,201 @@

+import numpy as np
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from torchvision.transforms.functional import InterpolationMode
+from transformers import AutoModel, AutoTokenizer
+import time
+import argparse
+import sys
"""
url: https://huggingface.co/5CD-AI/Vintern-1B-v3_5
"""
# Switch the console streams to UTF-8 so non-ASCII output does not raise
# UnicodeEncodeError (seen on Windows PowerShell). Best effort only.
try:
    sys.stdout.reconfigure(encoding='utf-8')
    sys.stderr.reconfigure(encoding='utf-8')
except Exception:
    pass

# Optional flash-attention install:
#   pip install ninja packaging wheel
#   pip install flash-attn --no-build-isolation

# Reference point for the total run time reported at the end.
start_time = time.time()

# Use the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Backend switches: cuDNN autotuning and TF32 for matmul / cuDNN.
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

print("Using device:", device)

# Per-channel statistics used by the normalization step of build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Return the tile pipeline: force RGB, square resize, to tensor, normalize."""

    def _as_rgb(img):
        # Leave RGB images alone; convert every other mode.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_as_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BILINEAR),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the grid from ``target_ratios`` closest to ``aspect_ratio``.

    Each candidate is a ``(cols, rows)`` pair compared through ``cols / rows``.
    Candidates are scanned in order: a strictly closer one always wins, and on
    an exact tie the later candidate replaces the current pick only when the
    image area (``width * height``) exceeds half the pixel area of that
    candidate's grid. ``(1, 1)`` is returned when there are no candidates.
    """
    source_area = width * height
    winner = (1, 1)
    smallest_gap = float('inf')
    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            winner, smallest_gap = candidate, gap
            continue
        # Tie: prefer the later grid only if the source is large enough for it.
        if gap == smallest_gap and source_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            winner = candidate
    return winner
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize ``image`` onto a grid of square tiles and return the tiles.

    Every ``(cols, rows)`` grid whose tile count lies between ``min_num`` and
    ``max_num`` is a candidate; the one chosen by
    ``find_closest_aspect_ratio`` decides the resize target. The resized image
    is cut row by row into ``image_size`` squares. With ``use_thumbnail`` set
    and more than one tile, a whole-image ``image_size`` square is appended.
    """
    src_w, src_h = image.size
    src_aspect = src_w / src_h

    candidates = {
        (cols, rows)
        for total in range(min_num, max_num + 1)
        for cols in range(1, total + 1)
        for rows in range(1, total + 1)
        if min_num <= cols * rows <= max_num
    }
    # Fewest tiles first.
    ordered = sorted(candidates, key=lambda grid: grid[0] * grid[1])

    grid = find_closest_aspect_ratio(src_aspect, ordered, src_w, src_h, image_size)
    canvas_w = image_size * grid[0]
    canvas_h = image_size * grid[1]
    tile_count = grid[0] * grid[1]

    canvas = image.resize((canvas_w, canvas_h))
    tiles = []
    for idx in range(tile_count):
        per_row = canvas_w // image_size
        row, col = idx // per_row, idx % per_row
        left = col * image_size
        top = row * image_size
        tiles.append(canvas.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == tile_count

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
def load_image(image_file, input_size=448, max_num=12, use_thumbnail=False, pin_memory=False):
    """Read an image file and return its tiles as one stacked tensor.

    Args:
        image_file: Path or file object accepted by ``Image.open``.
        input_size: Edge length of each square tile.
        max_num: Upper bound on the number of tiles; ``1`` without a
            thumbnail takes the single-tile fast path.
        use_thumbnail: Passed on to ``dynamic_preprocess``.
        pin_memory: Pin the resulting tensor before returning it.

    Returns:
        A tensor whose leading dimension is the number of tiles.
    """
    # Image.open is lazy and keeps the file open; convert() forces the pixel
    # data to load and the context manager then closes the file promptly.
    with Image.open(image_file) as handle:
        image = handle.convert('RGB')

    transform = build_transform(input_size=input_size)
    if max_num == 1 and not use_thumbnail:
        # Fast path: a single tile, no tiling logic needed.
        pixel_values = transform(image).unsqueeze(0)
    else:
        tiles = dynamic_preprocess(
            image, image_size=input_size, use_thumbnail=use_thumbnail, max_num=max_num
        )
        pixel_values = torch.stack([transform(tile) for tile in tiles])

    if pin_memory:
        pixel_values = pixel_values.pin_memory()
    return pixel_values
# Load the checkpoint once at import time and move it to the selected device.
model_load_start = time.time()
model = AutoModel.from_pretrained(
    "5CD-AI/Vintern-1B-v3_5",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    # Only request flash-attention when running on CUDA; it also needs the
    # flash-attn package (see the pip notes near the top of this script).
    use_flash_attn=(device.type == "cuda"),
).to(device).eval()
model_load_end = time.time()

tokenizer = AutoTokenizer.from_pretrained(
    "5CD-AI/Vintern-1B-v3_5",
    trust_remote_code=True,
    use_fast=False,
)
def main():
    """CLI entry point: preprocess one image, query the model, print the answer.

    Uses the module-level ``model``, ``tokenizer`` and ``device``. Prints the
    question/answer pair followed by the model-load and total timings.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--image', type=str, default=r'C:\Users\ADMIN\Downloads\vintern_api\imgs\6.TKngknhnCMC_00001.png')
    # 448 is the only size actually used (anything else falls back to it), so
    # it is also the default; the old default of 384 warned on every run.
    parser.add_argument('--input_size', type=int, default=448)
    parser.add_argument('--max_num', type=int, default=1)
    parser.add_argument('--use_thumbnail', action='store_true', default=False)
    parser.add_argument('--max_new_tokens', type=int, default=128)
    parser.add_argument('--num_beams', type=int, default=1)
    parser.add_argument('--do_sample', action='store_true', default=False)
    parser.add_argument('--repetition_penalty', type=float, default=2.5)
    parser.add_argument('--question', type=str, default='<image>\nTrích xuất thông tin chính trong ảnh và trả về dạng markdown.')
    parser.add_argument('--compile', action='store_true', default=False)
    args = parser.parse_args()

    pin_mem = device.type == 'cuda'

    # Tiles other than 448 px are replaced by 448 for stability.
    valid_input_size = args.input_size
    if valid_input_size != 448:
        print(f"[warn] input_size {args.input_size} may be incompatible; falling back to 448 for stability.")
        valid_input_size = 448

    # Image preprocessing and non-blocking transfer to the device.
    pixel_values = load_image(
        args.image,
        input_size=valid_input_size,
        max_num=args.max_num,
        use_thumbnail=args.use_thumbnail,
        pin_memory=pin_mem,
    )
    pixel_values = pixel_values.contiguous(memory_format=torch.channels_last)
    # Follow the model's parameter dtype instead of hard-coding float16, so
    # the input always matches however the model was loaded.
    model_dtype = next(model.parameters()).dtype
    pixel_values = pixel_values.to(device=device, dtype=model_dtype, non_blocking=True)

    # Optional torch.compile speed-up (PyTorch 2.x); fall back to the
    # uncompiled model, but say so instead of failing silently.
    if args.compile:
        try:
            model.forward = torch.compile(model.forward, mode='reduce-overhead', fullgraph=False)  # type: ignore
        except Exception as exc:
            print(f"[warn] torch.compile unavailable, continuing without it: {exc}")

    generation_config = dict(
        max_new_tokens=args.max_new_tokens,
        do_sample=args.do_sample,
        num_beams=args.num_beams,
        repetition_penalty=args.repetition_penalty,
    )

    with torch.inference_mode():
        response, _history = model.chat(
            tokenizer,
            pixel_values,
            args.question,
            generation_config,
            history=None,
            return_history=True,
        )

    print(f'User: {args.question}\nAssistant: {response}')
    end_time = time.time()
    print(f'Model load: {model_load_end - model_load_start:.2f}s  |  Total: {end_time - start_time:.2f}s')

    # Drop the input tensor and release cached GPU memory.
    del pixel_values
    if device.type == 'cuda':
        torch.cuda.empty_cache()


if __name__ == '__main__':
    main()