# Captured from a Hugging Face Space page (status banner at capture time: "Sleeping").
import os | |
import json | |
import re | |
import hashlib | |
import gc | |
from io import BytesIO | |
from collections import OrderedDict | |
from PIL import Image, UnidentifiedImageError | |
import torch | |
from transformers import AutoProcessor, BitsAndBytesConfig | |
from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration | |
from pdf2image import convert_from_bytes | |
import gradio as gr | |
import fitz | |
# --- CONFIGURATION ---
MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
CACHE_MAX_SIZE = 128
DPI = 300  # Kept moderate on purpose; not too high
IMAGE_MAX_DIM = None  # No resizing unless it is needed
JPEG_QUALITY = 80
GPU_MEMORY_FRACTION = 0.8

# --- 1. Device ---
# Enable the cuDNN benchmark flag regardless of the selected device.
torch.backends.cudnn.benchmark = True
if torch.cuda.is_available():
    device = torch.device("cuda")
    # Limit this process to GPU_MEMORY_FRACTION of the memory on GPU 0.
    torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=0)
else:
    device = torch.device("cpu")
# --- 2. Load model --- | |
# from transformers import AutoProcessor, BitsAndBytesConfig | |
# from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration | |
# bnb = BitsAndBytesConfig( | |
# load_in_4bit=True, | |
# bnb_4bit_use_double_quant=True, | |
# bnb_4bit_quant_type="nf4", | |
# bnb_4bit_compute_dtype=torch.float16 | |
# ) | |
# processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) | |
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained( | |
# MODEL_ID, | |
# quantization_config=bnb, | |
# device_map="auto", | |
# trust_remote_code=True | |
# ).eval() | |
# processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id | |
# --- 8. File handler --- | |
import traceback | |
from concurrent.futures import ThreadPoolExecutor | |
# def handle_file(file, prompt, extra_prompt, max_new_tokens, progress=gr.Progress()): | |
# try: | |
# file_path = file.name if hasattr(file, "name") else file | |
# filename = os.path.basename(file_path) | |
# ext = filename.lower().split('.')[-1] | |
# full_prompt = (prompt + "\n" + extra_prompt).strip() or "" | |
# print(f"[INFO] handle_file → {filename} (.{ext})") | |
# if ext == "pdf": | |
# try: | |
# with open(file_path, "rb") as f: | |
# pdf_bytes = f.read() | |
# print(f"[INFO] Read PDF bytes: {len(pdf_bytes)} bytes") | |
# doc = fitz.open(stream=pdf_bytes, filetype="pdf") | |
# pages = [] | |
# zoom = DPI | |
# mat = fitz.Matrix(zoom, zoom) | |
# for i, page in enumerate(doc): | |
# pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB) | |
# img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) | |
# if max(img.size) > 3072: | |
# img.thumbnail((3072, 3072), Image.Resampling.LANCZOS) | |
# pages.append(img) | |
# print(f"[INFO] Converted PDF → {len(pages)} pages") | |
# except Exception as e: | |
# traceback.print_exc() | |
# return filename, f"[ERROR] PDF conversion failed: {e}" | |
# outputs = [] | |
# with ThreadPoolExecutor(max_workers=4) as executor: | |
# futures = [executor.submit(run_inference, img, full_prompt, max_new_tokens) for img in pages] | |
# for idx, future in enumerate(futures): | |
# try: | |
# out = future.result() | |
# except Exception as e: | |
# traceback.print_exc() | |
# out = f"[ERROR] Inference page {idx+1} failed: {e}" | |
# outputs.append(out) | |
# progress((idx) / len(pages), desc=f"Page {idx+1}/{len(pages)}") | |
# result = "\n\n--- Page Break ---\n\n".join(outputs) | |
# print("[INFO] handle_file done") | |
# return filename, result | |
# else: | |
# try: | |
# img = Image.open(file_path) | |
# print(f"[INFO] Opened image: {img.mode}, {img.size}") | |
# except Exception as e: | |
# traceback.print_exc() | |
# return filename, f"[ERROR] Image open failed: {e}" | |
# return filename, run_inference(img, full_prompt, max_new_tokens) | |
# except Exception as e: | |
# traceback.print_exc() | |
# return "error", f"[ERROR] handle_file unexpected: {e}" | |
import time | |
import time | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
def _render_pdf_pages(file_path, max_dim=3072):
    """Rasterise every page of the PDF at *file_path* into RGB PIL images.

    Pages are rendered at ``DPI`` dots per inch and shrunk in place so that
    neither side exceeds *max_dim* pixels.
    """
    with open(file_path, "rb") as f:
        pdf_bytes = f.read()

    scale = DPI / 72  # PDF user space is 72 units per inch
    mat = fitz.Matrix(scale, scale)
    pages = []
    # The context manager closes the document even if a page fails to render.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            if max(img.size) > max_dim:
                img.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
            pages.append(img)
    return pages


def handle_file(file, prompt, extra_prompt, max_new_tokens, progress=None):
    """Run OCR inference on an uploaded PDF or image and report timings.

    Args:
        file: A filesystem path, or an object exposing the path as ``.name``.
        prompt: Free-form prompt text; ``None`` is treated as empty.
        extra_prompt: Additional prompt text appended on a new line;
            ``None`` is treated as empty.
        max_new_tokens: Generation length limit forwarded to the inference
            functions.
        progress: Optional callable invoked as ``progress(fraction, desc=...)``
            after each PDF page. The UI wiring in this file does not pass it,
            so it stays ``None`` there.

    Returns:
        ``(filename, text)`` on success, where ``text`` starts with an
        ``OKE (...)`` timing line followed by the model output. On any
        failure returns ``("error", "[ERROR] handle_file failed: ...")``.
    """
    try:
        if file is None:
            return "error", "[ERROR] handle_file failed: no file provided"

        file_path = file.name if hasattr(file, "name") else file
        filename = os.path.basename(file_path)
        # splitext avoids treating an extension-less file named "pdf" as a PDF.
        ext = os.path.splitext(filename)[1].lower()
        full_prompt = "\n".join((prompt or "", extra_prompt or "")).strip()
        start_total = time.perf_counter()

        if ext == ".pdf":
            # --- Convert the PDF into page images ---
            start_convert = time.perf_counter()
            pages = _render_pdf_pages(file_path)
            end_convert = time.perf_counter()

            # --- Run inference page by page ---
            start_infer = time.perf_counter()
            outputs = []
            for idx, img in enumerate(pages):
                outputs.append(run_inference(img, full_prompt, max_new_tokens))
                if progress:
                    # Report completed pages so the last page reaches 100%.
                    progress((idx + 1) / len(pages), desc=f"Page {idx+1}/{len(pages)}")
            end_infer = time.perf_counter()

            # --- Merge the per-page results ---
            combined_text = "\n\n".join(outputs)

            # --- Final aggregation pass over the merged text ---
            start_agg = time.perf_counter()
            # Pass the limit by keyword: the second positional slot of
            # run_inference_on_text is `prompt`, not `max_new_tokens`.
            final_result = run_inference_on_text(
                combined_text, max_new_tokens=max_new_tokens
            )
            end_agg = time.perf_counter()

            total_time = end_agg - start_total
            return filename, (
                f"OKE (total time: {total_time:.3f}s, convert: {end_convert - start_convert:.3f}s, "
                f"infer per page: {end_infer - start_infer:.3f}s, aggregate: {end_agg - start_agg:.3f}s)\n"
                f"{final_result}"
            )

        # --- Single image ---
        start_img = time.perf_counter()
        # convert() yields an independent in-memory image, so the file handle
        # held by Image.open can be released immediately.
        with Image.open(file_path) as opened:
            img = opened.convert("RGB")
        result = run_inference(img, full_prompt, max_new_tokens)
        total_time = time.perf_counter() - start_img
        return filename, f"OKE (time: {total_time:.3f}s)\n{result}"
    except Exception as e:
        # UI boundary: log the traceback and surface a message instead of crashing.
        import traceback
        traceback.print_exc()
        return "error", f"[ERROR] handle_file failed: {e}"
# def run_inference(img: Image.Image, prompt: str = "", max_new_tokens: int = 512) -> str: | |
# if img.mode != "RGB": | |
# img = img.convert("RGB") | |
# prompt_text = prompt.strip() | |
# messages = [{ | |
# "role": "user", | |
# "content": [ | |
# {"type": "image", "image": img}, | |
# {"type": "text", "text": prompt_text} | |
# ] | |
# }] | |
# text_prompt = processor.apply_chat_template( | |
# messages, tokenize=False, add_generation_prompt=True | |
# ) | |
# inputs = processor( | |
# text=[text_prompt], images=[img], return_tensors="pt", padding=True | |
# ).to(device) | |
# with torch.inference_mode(), torch.cuda.amp.autocast(enabled=(device.type == 'cuda')): | |
# gen = model.generate( | |
# **inputs, | |
# max_new_tokens=max_new_tokens, | |
# do_sample=False, | |
# eos_token_id=processor.tokenizer.eos_token_id | |
# ) | |
# trimmed = [o[len(i):] for i, o in zip(inputs['input_ids'], gen)] | |
# result = processor.tokenizer.batch_decode( | |
# trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True | |
# )[0].strip() | |
# return result | |
def run_inference(img: Image.Image, prompt: str = "", max_new_tokens: int = 512) -> str:
    """Debug stand-in for model inference on a single image.

    Converts the image to RGB when it is in another mode and returns a
    diagnostic string containing the image size and the first 30 characters
    of *prompt*. *max_new_tokens* is accepted but not used here.
    """
    rgb_img = img if img.mode == "RGB" else img.convert("RGB")
    prompt_preview = prompt[:30]
    return f"[DEBUG] Ảnh nhận: size={rgb_img.size}, prompt='{prompt_preview}...'"
def run_inference_on_text(img: Image.Image, prompt: str = "", max_new_tokens: int = 512) -> str:
    """Debug stand-in for the final aggregation pass; returns a fixed string.

    None of the arguments are used. NOTE(review): the first parameter is
    named and annotated as an image, but handle_file passes the joined page
    text in that position — confirm the intended signature before
    implementing this for real.
    """
    return "[DEBUG] Total..."
# --- 9. Prompt templates & JSON export --- | |
# Instruction block shared verbatim by every template. Keeping a single copy
# means a wording fix (such as the XML entity names in rule 10) applies to
# all document types at once.
_TEMPLATE_RULES = """You must return the result as a valid XML block that strictly follows the structure below.
STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
1. Return **ONLY** the XML block – nothing before or after it.
2. DO NOT add, remove, rename, or reorder any XML tags.
3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
4. For every tag, fill in the exact value read from the image.
• NEVER copy or repeat the label/placeholder text.
• NEVER guess or invent values.
5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
6. DO NOT include Vietnamese text or translations inside tag values.
7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
8. Dates must be in YYYY-MM-DD format.
9. Boolean tags must be exactly true or false (lower-case, no quotes).
✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
10. **Inside each value**
• Replace every internal line-break with “, ” (comma + space).
• Trim leading/trailing whitespace.
• Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
13. Ignore any information not represented by the tags below.
"""


def _build_template(intro, schema):
    """Assemble a prompt: one-line intro, the shared rules, then the XML schema."""
    return f"{intro}\n{_TEMPLATE_RULES}{schema}"


# Maps the document-type label shown in the UI to its extraction prompt.
# The Vietnamese text inside each tag is the placeholder describing the field.
prompt_templates = {
    "Electrolux": _build_template(
        "Extract all structured information from the delivery order document image.",
        """<s_electrolux_form>
<document_number>Số lệnh giao nhận hàng</document_number>
<order_number>Số đơn hàng</order_number>
<customer_code>Mã số khách hàng</customer_code>
<customer_order_code>Mã đơn khách hàng</customer_order_code>
<customer_order_date>Ngày đặt hàng của khách</customer_order_date>
<delivery_date>Ngày giao hàng</delivery_date>
<requested_delivery_date>Ngày giao hàng yêu cầu</requested_delivery_date>
<invoice_number>Số hóa đơn</invoice_number>
<shipper_company_name>Tên công ty gửi hàng</shipper_company_name>
<shipper_address>Địa chỉ gửi hàng</shipper_address>
<shipper_phone>Số điện thoại</shipper_phone>
<shipper_fax>Số fax</shipper_fax>
<shipper_tax_code>Mã số thuế</shipper_tax_code>
<consignee_customer_code>Mã khách hàng</consignee_customer_code>
<consignee_company_name>Tên công ty nhận hàng</consignee_company_name>
<shipping_address>Địa chỉ nhận hàng chi tiết</shipping_address>
<city_province>Tỉnh/Thành phố</city_province>
<postal_code>Mã bưu chính</postal_code>
<preparer_name>Họ tên người lập phiếu</preparer_name>
<preparer_date>Ngày lập phiếu</preparer_date>
<s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
</s_electrolux_form>
""",
    ),
    "Jotun": _build_template(
        "Extract all structured information from the delivery order document.",
        """<s_jotun_form>
<document_number>Số lệnh giao hàng</document_number>
<delivery_order_code>Số lệnh giao hàng số</delivery_order_code>
<customer_code>Mã khách hàng</customer_code>
<customer_name>Tên khách hàng</customer_name>
<customer_address>Địa chỉ khách hàng</customer_address>
<customer_phone>Điện thoại khách hàng</customer_phone>
<invoice_receiver_name>Tên người nhận hóa đơn</invoice_receiver_name>
<invoice_receiver_address>Địa chỉ người nhận hóa đơn</invoice_receiver_address>
<order_code>Số đơn đặt hàng</order_code>
<order_date>Ngày đặt hàng</order_date>
<order_number>Số đơn hàng</order_number>
<delivery_date>Ngày giao hàng</delivery_date>
<s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
</s_jotun_form>
""",
    ),
    "MAWB": _build_template(
        "Extract all structured information from the Master Air Waybill (MAWB) document.",
        """<s_mawb_form>
<air_waybill_number>Số MAWB</air_waybill_number>
<shipper_name>Tên người gửi hàng</shipper_name>
<shipper_address>Địa chỉ người gửi hàng</shipper_address>
<shipper_account_number>Mã tài khoản người gửi</shipper_account_number>
<consignee_name>Tên người nhận hàng</consignee_name>
<consignee_address>Địa chỉ người nhận hàng</consignee_address>
<consignee_account_number>Mã tài khoản người nhận</consignee_account_number>
<dangerous_goods_note>Ghi chú hàng nguy hiểm (true or false)</dangerous_goods_note>
<shipper_signature>Chữ ký người gửi</shipper_signature>
</s_mawb_form>
""",
    ),
    "Phiếu Cân": _build_template(
        "Extract all structured information from the document 'PHIẾU CÂN / SHIPPER’S LETTER OF INSTRUCTIONS'.",
        """<s_weight_ticket>
<awb_number>Số AWB</awb_number>
<shipper_name>Tên người gửi hàng</shipper_name>
<shipper_address>Địa chỉ người gửi hàng</shipper_address>
<shipper_contact>Số điện thoại người gửi</shipper_contact>
<consignee_name>Tên người nhận hàng</consignee_name>
<consignee_address>Địa chỉ người nhận hàng</consignee_address>
<cargo_description>Tên hàng hóa</cargo_description>
<security_check_complete>Đã kiểm tra an ninh (true/false)</security_check_complete>
<acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
<acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
</s_weight_ticket>
""",
    ),
    "PC 3U": _build_template(
        "Extract all structured information from the PC 3U air cargo instruction document.",
        """<s_pc3u_form>
<awb_number>Số AWB</awb_number>
<cargo_service_code>Mã dịch vụ</cargo_service_code>
<shipper_name>Tên người gửi</shipper_name>
<shipper_address>Địa chỉ người gửi</shipper_address>
<shipper_contact>Thông tin liên hệ người gửi</shipper_contact>
<payer_name>Người thanh toán</payer_name>
<payer_tax_code>Mã số thuế người thanh toán</payer_tax_code>
<consignee_name>Tên người nhận</consignee_name>
<consignee_address>Địa chỉ người nhận</consignee_address>
<consignee_contact>Thông tin liên hệ người nhận</consignee_contact>
<shipper_signature>Chữ ký người gửi</shipper_signature>
<acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
</s_pc3u_form>
""",
    ),
    "SLIS-AVS DAD": _build_template(
        "Extract all structured information from the document 'TỜ KHAI GỬI HÀNG - SHIPPER’S LETTER OF INSTRUCTION'.",
        """<s_avs_dad>
<air_waybill_number>Số AWB</air_waybill_number>
<form_code>Mã biểu mẫu</form_code>
<shipper_name>Tên người gửi</shipper_name>
<shipper_address>Địa chỉ người gửi</shipper_address>
<shipper_phone>Điện thoại người gửi</shipper_phone>
<shipper_email>Email người gửi</shipper_email>
<shipper_tax_code>Mã số thuế người gửi</shipper_tax_code>
<consignee_name>Tên người nhận</consignee_name>
<consignee_address>Địa chỉ người nhận</consignee_address>
<consignee_phone>Điện thoại người nhận</consignee_phone>
<consignee_email>Email người nhận</consignee_email>
<departure_airport>Nơi đi</departure_airport>
<destination_airport>Nơi đến</destination_airport>
<acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
<acceptance_signature>Chữ ký nhân viên tiếp nhận</acceptance_signature>
<acceptance_time>Thời điểm tiếp nhận</acceptance_time>
<shipper_signature>Chữ ký người gửi</shipper_signature>
<shipper_signature_date>Ngày ký người gửi</shipper_signature_date>
</s_avs_dad>
""",
    ),
}
def insert_template(name):
    """Return the prompt template stored under *name*, or "" when there is none."""
    if name in prompt_templates:
        return prompt_templates[name]
    return ""
def sanitize_filename(name):
    """Replace every character outside ``a-z A-Z 0-9 _ - .`` with an underscore."""
    disallowed = re.compile(r'[^a-zA-Z0-9_\-\.]')
    return disallowed.sub('_', name)
def clean_text(text):
    """Trim whitespace around XML leaf values and around the whole text.

    For every ``<tag>value</tag>`` pair whose content holds no nested tag,
    leading and trailing whitespace of the value is removed. Whitespace
    between sibling or nested tags is left untouched.

    The previous implementation called ``.strip()`` on whole regex matches
    that always began with ``<`` and ended with ``>``, so it changed nothing
    inside the text.
    """
    def _trim_value(match):
        # Groups: opening tag, raw value, closing tag.
        return f"{match.group(1)}{match.group(2).strip()}{match.group(3)}"

    # The opening tag must not start with "/", so a closing tag is never
    # mistaken for the start of an element.
    trimmed = re.sub(r'(<[^<>/][^<>]*>)([^<>]*)(</[^<>]+>)', _trim_value, text)
    return trimmed.strip()
def export_json(image_name, result_text):
    """Write the extraction result to a JSON file and return it for display.

    Args:
        image_name: Name of the processed file; stored under ``"image"`` and,
            after sanitising, used as the output file's base name.
        result_text: Raw result text; stored under ``"text_sequence"`` after
            being passed through ``clean_text``.

    Returns:
        ``(path, json_string)`` on success. On failure returns
        ``(None, "[Export JSON Failed]: ...")`` so that a file output
        receives "no file" rather than an empty path.
    """
    import tempfile  # local import: only this function needs it

    try:
        # Fall back to a fixed base name so an empty input cannot produce ".json".
        clean_name = sanitize_filename(image_name) or "result"
        content = {"image": image_name, "text_sequence": clean_text(result_text)}
        serialized = json.dumps(content, ensure_ascii=False, indent=2)
        # Use the platform temp directory instead of a hard-coded "/tmp".
        path = os.path.join(tempfile.gettempdir(), f"{clean_name}.json")
        with open(path, "w", encoding="utf-8") as f:
            f.write(serialized)
        return path, serialized
    except Exception as e:
        # UI boundary: report the failure in the text output instead of raising.
        return None, f"[Export JSON Failed]: {e}"
# --- 10. Gradio UI --- | |
# Custom stylesheet passed to gr.Blocks(css=css) below. It overrides font
# sizes, padding and heights for textboxes, buttons, file widgets, markdown
# headings and code blocks.
css = """
.gradio-textbox textarea {
font-size: 13px !important;
line-height: 1.3 !important;
padding: 6px 8px !important;
}
.gradio-textbox label {
font-size: 13px !important;
font-weight: 600 !important;
margin-bottom: 4px !important;
}
.gradio-button {
font-size: 12px !important;
padding: 4px 8px !important;
height: 28px !important;
min-height: 28px !important;
margin: 2px !important;
}
.gradio-button[data-variant="primary"] {
height: 36px !important;
font-size: 13px !important;
padding: 8px 16px !important;
}
.gradio-file {
font-size: 13px !important;
}
.gradio-file .file-upload {
padding: 8px !important;
min-height: 80px !important;
}
.gradio-markdown h3 {
font-size: 14px !important;
margin: 8px 0 4px 0 !important;
}
.gradio-markdown h2 {
font-size: 18px !important;
margin: 8px 0 !important;
}
.gradio-code {
font-size: 12px !important;
}
"""
with gr.Blocks(title="Camel-Doc-OCR", css=css) as demo:
    gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL, 4-bit)")

    # --- Main layout: two columns ---
    with gr.Row():
        # === Left column: inputs ===
        with gr.Column(scale=1):
            gr.Markdown("### 📥 INPUT")

            # No extension filter: every file type is accepted. handle_file
            # treats anything that is not a PDF as an image.
            file_input = gr.File(
                label="📤 Tải ảnh hoặc PDF",
                file_types=None,
                height=100,
            )

            # Free-form prompt
            prompt_input = gr.Textbox(
                label="Prompt thuần",
                lines=2,
                placeholder="Nhập prompt tùy chỉnh...",
                max_lines=3,
            )

            # Template prompt (filled in by the template buttons below)
            config_input = gr.Textbox(
                label="JSON Prompt",
                lines=6,
                placeholder="Cấu hình JSON sẽ xuất hiện ở đây...",
                max_lines=8,
            )

            # Output length limit
            max_new_tokens_input = gr.Radio(
                choices=[128, 256, 512, 1024, 1536, 2048],
                value=512,
                label="🔢 Chọn max_new_tokens (giới hạn độ dài đầu ra)",
                info="Chọn độ dài tối đa cho đầu ra của mô hình",
            )

            # One button per prompt template, all in a single row
            gr.Markdown("### 📑 Mẫu:")
            with gr.Row():
                for key in prompt_templates:
                    # k=key binds the current key now; a plain closure would
                    # see only the last key once the loop has finished.
                    gr.Button(f"{key}", size="sm", scale=1).click(
                        fn=lambda *, k=key: insert_template(k),
                        inputs=[],
                        outputs=config_input,
                    )

            run_btn = gr.Button("🚀 Chạy OCR", variant="primary")

        # === Right column: outputs ===
        with gr.Column(scale=1):
            gr.Markdown("### 📤 OUTPUT")

            result_output = gr.Textbox(
                label="Kết quả trích xuất",
                lines=10,
                placeholder="Kết quả sẽ hiển thị ở đây sau khi chạy OCR...",
                max_lines=12,
            )

            # Export controls stay hidden until there is something to export.
            with gr.Row():
                export_btn = gr.Button("📦 Xuất JSON", visible=False, variant="secondary", size="sm")

            json_text = gr.Code(
                label="JSON Output",
                language="json",
                lines=6,
                visible=False,
            )

            json_file = gr.File(
                label="File JSON để tải",
                visible=False,
                file_types=[".json"],
            )

    # Carries the processed file name from the OCR run to the export step.
    hidden_name = gr.Textbox(visible=False)

    # --- Event handlers ---
    # Run OCR, then reveal the export button. Without this step the button
    # would stay hidden and the export feature could never be reached.
    run_btn.click(
        fn=handle_file,
        inputs=[file_input, prompt_input, config_input, max_new_tokens_input],
        outputs=[hidden_name, result_output],
    ).then(
        fn=lambda: gr.update(visible=True),
        outputs=[export_btn],
    )

    # Build the JSON export first, then reveal both export outputs, as one
    # ordered chain instead of three independent listeners on the same click.
    export_btn.click(
        fn=export_json,
        inputs=[hidden_name, result_output],
        outputs=[json_file, json_text],
    ).then(
        fn=lambda: (gr.update(visible=True), gr.update(visible=True)),
        outputs=[json_file, json_text],
    )

if __name__ == "__main__":
    demo.launch()