DeepSeek-OCR-DEMO

Running on Zero

App Files Community

khang119966 committed 9 days ago

Commit

f10b987

verified ·

1 Parent(s): 3fcdc10

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -75

app.py CHANGED Viewed

@@ -6,12 +6,11 @@ import os
 import tempfile
 from PIL import Image
-# --- Tải Model và Tokenizer (Chỉ một lần khi khởi động) ---
-# Di chuyển việc tải model ra ngoài để tránh tải lại mỗi lần gọi hàm
 print("Loading model and tokenizer...")
 model_name = "deepseek-ai/DeepSeek-OCR"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-# Tải model lên CPU trước, sau đó chuyển sang GPU trong hàm xử lý
 model = AutoModel.from_pretrained(
     model_name,
     _attn_implementation="flash_attention_2",
@@ -19,51 +18,50 @@ model = AutoModel.from_pretrained(
     use_safetensors=True,
 )
 model = model.eval()
-print("Model loaded successfully.")
-# --- Hàm xử lý chính ---
 @spaces.GPU
 def process_ocr_task(image, model_size, task_type, ref_text):
     """
-    Xử lý hình ảnh với DeepSeek-OCR cho tất cả các tác vụ.
     Args:
-        image: Đối tượng PIL Image
-        model_size: Cấu hình kích thước model
-        task_type: Loại tác vụ OCR
-        ref_text: Văn bản tham chiếu cho tác vụ 'Locate'
     """
     if image is None:
         return "Please upload an image first.", None
-    # Chuyển model sang GPU và định dạng bfloat16 để tối ưu hiệu suất
-    print("Moving model to GPU...")
     model_gpu = model.cuda().to(torch.bfloat16)
-    print("Model on GPU.")
-    # Tạo thư mục tạm thời để lưu trữ đầu ra
     with tempfile.TemporaryDirectory() as output_path:
-        # --- Xây dựng prompt dựa trên loại tác vụ ---
-        if task_type == "Free OCR":
             prompt = "<image>\nFree OCR."
-        elif task_type == "Convert to Markdown":
             prompt = "<image>\n<|grounding|>Convert the document to markdown."
-        elif task_type == "Parse Figure":
             prompt = "<image>\nParse the figure."
-        elif task_type == "Locate Object by Reference":
             if not ref_text or ref_text.strip() == "":
-                raise gr.Error("For 'Locate' task, please provide the reference text to find.")
-            # Sử dụng f-string để chèn văn bản tham chiếu vào prompt
             prompt = f"<image>\nLocate <|ref|>{ref_text.strip()}<|/ref|> in the image."
         else:
-            # Mặc định là Free OCR
-            prompt = "<image>\nFree OCR."
-        # Lưu ảnh được tải lên vào thư mục tạm
         temp_image_path = os.path.join(output_path, "temp_image.png")
         image.save(temp_image_path)
-        # Cấu hình các tham số kích thước model
         size_configs = {
             "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
             "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
@@ -73,8 +71,8 @@ def process_ocr_task(image, model_size, task_type, ref_text):
         }
         config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
-        print(f"Running inference with prompt: {prompt}")
-        # --- Chạy inference ---
         text_result = model_gpu.infer(
             tokenizer,
             prompt=prompt,
@@ -83,120 +81,119 @@ def process_ocr_task(image, model_size, task_type, ref_text):
             base_size=config["base_size"],
             image_size=config["image_size"],
             crop_mode=config["crop_mode"],
-            save_results=True,  # Quan trọng: phải lưu kết quả để lấy ảnh output
             test_compress=True,
             eval_mode=True,
         )
-        print(f"====\nText Result: {text_result}\n====")
-        # --- Xử lý output (văn bản và hình ảnh) ---
         image_result_path = None
-        # Tác vụ 'Locate' và 'Markdown' thường tạo ra ảnh kết quả có chữ 'grounding'
-        if task_type in ["Locate Object by Reference", "Convert to Markdown", "Parse Figure"]:
-            # Tìm file ảnh kết quả trong thư mục output
             for filename in os.listdir(output_path):
                 if "grounding" in filename or "result" in filename:
                     image_result_path = os.path.join(output_path, filename)
                     break
-        # Nếu tìm thấy ảnh, tải nó, nếu không trả về None
         result_image_pil = Image.open(image_result_path) if image_result_path else None
         return text_result, result_image_pil
-# --- Xây dựng giao diện Gradio ---
-with gr.Blocks(title="DeepSeek-OCR", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
-        # Demo toàn diện DeepSeek-OCR
-        Tải lên một hình ảnh để thử nghiệm các khả năng nhận dạng và hiểu tài liệu của DeepSeek-OCR.
-        **Hướng dẫn:**
-        1. Tải lên một hình ảnh.
-        2. Chọn **Model Size** phù hợp (Gundam được khuyến nghị cho tài liệu).
-        3. Chọn **Task Type**:
-            - **Free OCR**: Trích xuất văn bản thô.
-            - **Convert to Markdown**: Chuyển đổi tài liệu (giữ cấu trúc) sang định dạng Markdown.
-            - **Parse Figure**: Phân tích và trích xuất dữ liệu từ biểu đồ, hình vẽ.
-            - **Locate Object by Reference**: Tìm một đối tượng hoặc văn bản cụ thể trong ảnh. **Bạn cần nhập nội dung cần tìm vào ô "Reference Text" bên dưới.**
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
-            image_input = gr.Image(type="pil", label="Tải ảnh lên", sources=["upload", "clipboard"])
             model_size = gr.Dropdown(
                 choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                 value="Gundam (Recommended)",
-                label="Model Size",
             )
             task_type = gr.Dropdown(
-                choices=["Free OCR", "Convert to Markdown", "Parse Figure", "Locate Object by Reference"],
-                value="Convert to Markdown",
-                label="Task Type",
             )
-            # Ô nhập văn bản tham chiếu, ban đầu bị ẩn
             ref_text_input = gr.Textbox(
-                label="Reference Text (cho tác vụ Locate)",
-                placeholder="Ví dụ: the teacher, 11-2=, a red car...",
-                visible=False, # Ban đầu ẩn đi
             )
-            submit_btn = gr.Button("Xử lý", variant="primary")
         with gr.Column(scale=2):
-            output_text = gr.Textbox(label="Kết quả văn bản", lines=15, show_copy_button=True)
-            output_image = gr.Image(label="Kết quả hình ảnh (nếu có)", type="pil")
-    # --- Logic tương tác cho giao diện ---
     def toggle_ref_text_visibility(task):
-        # Nếu người dùng chọn 'Locate', hiển thị ô nhập văn bản
-        if task == "Locate Object by Reference":
             return gr.Textbox(visible=True)
         else:
             return gr.Textbox(visible=False)
-    # Khi dropdown 'task_type' thay đổi, gọi hàm để cập nhật trạng thái hiển thị của ô ref_text_input
     task_type.change(
         fn=toggle_ref_text_visibility,
         inputs=task_type,
         outputs=ref_text_input,
     )
-    # Khi nhấn nút submit
     submit_btn.click(
         fn=process_ocr_task,
         inputs=[image_input, model_size, task_type, ref_text_input],
         outputs=[output_text, output_image],
     )
-    # --- Các ví dụ minh họa ---
     gr.Examples(
         examples=[
-            ["./examples/doc_markdown.png", "Gundam (Recommended)", "Convert to Markdown", ""],
-            ["./examples/chart.png", "Gundam (Recommended)", "Parse Figure", ""],
-            ["./examples/teacher.png", "Base", "Locate Object by Reference", "the teacher"],
-            ["./examples/math_locate.png", "Small", "Locate Object by Reference", "11-2="],
-            ["./examples/receipt.jpg", "Base", "Free OCR", ""],
         ],
         inputs=[image_input, model_size, task_type, ref_text_input],
         outputs=[output_text, output_image],
         fn=process_ocr_task,
-        cache_examples=False, # Tắt cache để đảm bảo chạy lại mỗi lần click
     )
-# --- Khởi chạy ứng dụng ---
 if __name__ == "__main__":
-    # Tạo thư mục examples và tải ảnh ví dụ (nếu chưa có)
     if not os.path.exists("examples"):
         os.makedirs("examples")
-    # Bạn cần tự tải các file ảnh ví dụ vào thư mục "examples"
-    # Ví dụ: doc_markdown.png, chart.png, teacher.png, math_locate.png, receipt.jpg
     demo.queue(max_size=20)
-    demo.launch(share=True) # share=True để tạo link public

 import tempfile
 from PIL import Image
+# --- 1. Load Model and Tokenizer (Done only once at startup) ---
 print("Loading model and tokenizer...")
 model_name = "deepseek-ai/DeepSeek-OCR"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+# Load the model to CPU first; it will be moved to GPU during processing
 model = AutoModel.from_pretrained(
     model_name,
     _attn_implementation="flash_attention_2",
     use_safetensors=True,
 )
 model = model.eval()
+print("✅ Model loaded successfully.")
+# --- 2. Main Processing Function ---
 @spaces.GPU
 def process_ocr_task(image, model_size, task_type, ref_text):
     """
+    Processes an image with DeepSeek-OCR for all supported tasks.
     Args:
+        image (PIL.Image): The input image.
+        model_size (str): The model size configuration.
+        task_type (str): The type of OCR task to perform.
+        ref_text (str): The reference text for the 'Locate' task.
     """
     if image is None:
         return "Please upload an image first.", None
+    # Move the model to GPU and use bfloat16 for better performance
+    print("🚀 Moving model to GPU...")
     model_gpu = model.cuda().to(torch.bfloat16)
+    print("✅ Model is on GPU.")
+    # Create a temporary directory to store files
     with tempfile.TemporaryDirectory() as output_path:
+        # --- Build the prompt based on the selected task type ---
+        if task_type == "📝 Free OCR":
             prompt = "<image>\nFree OCR."
+        elif task_type == "📄 Convert to Markdown":
             prompt = "<image>\n<|grounding|>Convert the document to markdown."
+        elif task_type == "📈 Parse Figure":
             prompt = "<image>\nParse the figure."
+        elif task_type == "🔍 Locate Object by Reference":
             if not ref_text or ref_text.strip() == "":
+                raise gr.Error("For the 'Locate' task, you must provide the reference text to find!")
+            # Use an f-string to embed the user's reference text into the prompt
             prompt = f"<image>\nLocate <|ref|>{ref_text.strip()}<|/ref|> in the image."
         else:
+            prompt = "<image>\nFree OCR." # Default fallback
+        # Save the uploaded image to the temporary path
         temp_image_path = os.path.join(output_path, "temp_image.png")
         image.save(temp_image_path)
+        # Configure model size parameters
         size_configs = {
             "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
             "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
         }
         config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
+        print(f"🏃 Running inference with prompt: {prompt}")
+        # --- Run the model's inference method ---
         text_result = model_gpu.infer(
             tokenizer,
             prompt=prompt,
             base_size=config["base_size"],
             image_size=config["image_size"],
             crop_mode=config["crop_mode"],
+            save_results=True,  # Important: Must be True to get the output image
             test_compress=True,
             eval_mode=True,
         )
+        print(f"====\n📄 Text Result: {text_result}\n====")
+        # --- Handle the output (both text and image) ---
         image_result_path = None
+        # Tasks that generate a visual output usually create a 'grounding' or 'result' image
+        if task_type in ["🔍 Locate Object by Reference", "📄 Convert to Markdown", "📈 Parse Figure"]:
+            # Find the result image in the output directory
             for filename in os.listdir(output_path):
                 if "grounding" in filename or "result" in filename:
                     image_result_path = os.path.join(output_path, filename)
                     break
+        # If an image was found, open it with PIL; otherwise, return None
         result_image_pil = Image.open(image_result_path) if image_result_path else None
         return text_result, result_image_pil
+# --- 3. Build the Gradio Interface ---
+with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
+        # 🐳 Full Demo of DeepSeek-OCR 🐳
+        Upload an image to explore the document recognition and understanding capabilities of DeepSeek-OCR.
+        **💡 How to use:**
+        1.  **Upload an image** using the upload box.
+        2.  Select a **Model Size**. `Gundam` is recommended for most documents for a good balance of speed and accuracy.
+        3.  Choose a **Task Type**:
+            - **📝 Free OCR**: Extracts raw text from the image. Best for simple text extraction.
+            - **📄 Convert to Markdown**: Converts the entire document into Markdown format, preserving structure like headers, lists, and tables.
+            - **📈 Parse Figure**: Analyzes and extracts structured data from charts, graphs, and geometric figures.
+            - **🔍 Locate Object by Reference**: Finds a specific object or piece of text in the image. You **must** type what you're looking for into the **"Reference Text"** box that appears.
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
+            image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
             model_size = gr.Dropdown(
                 choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
                 value="Gundam (Recommended)",
+                label="⚙️ Model Size",
             )
             task_type = gr.Dropdown(
+                choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure", "🔍 Locate Object by Reference"],
+                value="📄 Convert to Markdown",
+                label="🚀 Task Type",
             )
             ref_text_input = gr.Textbox(
+                label="📝 Reference Text (for Locate task)",
+                placeholder="e.g., the teacher, 11-2=, a red car...",
+                visible=False, # Initially hidden
             )
+            submit_btn = gr.Button("Process Image", variant="primary")
         with gr.Column(scale=2):
+            output_text = gr.Textbox(label="📄 Text Result", lines=15, show_copy_button=True)
+            output_image = gr.Image(label="🖼️ Image Result (if any)", type="pil")
+    # --- UI Interaction Logic ---
     def toggle_ref_text_visibility(task):
+        # If the user selects the 'Locate' task, make the reference textbox visible
+        if task == "🔍 Locate Object by Reference":
             return gr.Textbox(visible=True)
         else:
             return gr.Textbox(visible=False)
+    # When the 'task_type' dropdown changes, call the function to update the visibility
     task_type.change(
         fn=toggle_ref_text_visibility,
         inputs=task_type,
         outputs=ref_text_input,
     )
+    # Define what happens when the submit button is clicked
     submit_btn.click(
         fn=process_ocr_task,
         inputs=[image_input, model_size, task_type, ref_text_input],
         outputs=[output_text, output_image],
     )
+    # --- Example Images and Tasks ---
     gr.Examples(
         examples=[
+            ["./examples/doc_markdown.png", "Gundam (Recommended)", "📄 Convert to Markdown", ""],
+            ["./examples/chart.png", "Gundam (Recommended)", "📈 Parse Figure", ""],
+            ["./examples/teacher.png", "Base", "🔍 Locate Object by Reference", "the teacher"],
+            ["./examples/math_locate.png", "Small", "🔍 Locate Object by Reference", "11-2="],
+            ["./examples/receipt.jpg", "Base", "📝 Free OCR", ""],
         ],
         inputs=[image_input, model_size, task_type, ref_text_input],
         outputs=[output_text, output_image],
         fn=process_ocr_task,
+        cache_examples=False, # Disable caching to ensure examples run every time
     )
+# --- 4. Launch the App ---
 if __name__ == "__main__":
+    # Create an 'examples' directory if it doesn't exist
     if not os.path.exists("examples"):
         os.makedirs("examples")
+    # Please manually download the example images into the "examples" folder.
+    # e.g., doc_markdown.png, chart.png, teacher.png, math_locate.png, receipt.jpg
     demo.queue(max_size=20)
+    demo.launch(share=True) # Set share=True to create a public link