axiilay committed on
Commit
3da4f0d
·
1 Parent(s): 29a03ec

Add DeepSeek-OCR Gradio application files

Browse files
Files changed (3) hide show
  1. README.md +34 -5
  2. app.py +141 -0
  3. requirements.txt +15 -0
README.md CHANGED
@@ -1,14 +1,43 @@
1
  ---
2
  title: DeepSeek OCR Demo
3
- emoji: 🏃
4
- colorFrom: gray
5
  colorTo: red
6
  sdk: gradio
7
- sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
- short_description: 'An interactive demo for the DeepSeek-OCR '
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: DeepSeek OCR Demo
3
+ emoji: 🖼
4
+ colorFrom: purple
5
  colorTo: red
6
  sdk: gradio
7
+ sdk_version: 5.44.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
+ short_description: An interactive demo for the DeepSeek-OCR model.
12
  ---
13
 
14
+ # DeepSeek-OCR Document Recognition
15
+
16
+ This Space uses the DeepSeek-OCR model for document text recognition and extraction.
17
+
18
+ ## Features
19
+
20
+ - Multiple model size options (Tiny to Large)
21
+ - Free OCR and Markdown conversion
22
+ - Support for various document types
23
+ - Powered by ZeroGPU for efficient inference
24
+
25
+ ## Usage
26
+
27
+ 1. Upload an image containing text
28
+ 2. Select model size (Gundam recommended for documents)
29
+ 3. Choose task type
30
+ 4. Click "Process Image"
31
+
32
+ ## Model Sizes
33
+
34
+ - **Tiny**: 512x512, fastest
35
+ - **Small**: 640x640, good balance
36
+ - **Base**: 1024x1024, high quality
37
+ - **Large**: 1280x1280, best quality
38
+ - **Gundam**: Optimized for documents with crop mode
39
+
40
+ ## Credits
41
+
42
+ Model: [deepseek-ai/DeepSeek-OCR](https://huggingface.co/deepseek-ai/DeepSeek-OCR)
43
+
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoModel, AutoTokenizer
4
+ import spaces
5
+ import os
6
+ import tempfile
7
+
8
# Hub repository id used for both the tokenizer and the model weights.
model_name = "deepseek-ai/DeepSeek-OCR"

# Both loaders are called with trust_remote_code=True.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the weights, switch to inference mode and cast to bfloat16 in one
# expression.
# NOTE(review): nothing here moves the model to a device explicitly; device
# placement is presumably handled inside model.infer() or by the `spaces`
# runtime -- confirm.
model = (
    AutoModel.from_pretrained(
        model_name,
        _attn_implementation="flash_attention_2",
        trust_remote_code=True,
        use_safetensors=True,
    )
    .eval()
    .to(torch.bfloat16)
)
18
+
19
+
20
# Prompt sent to the model for each task type offered in the UI.
_TASK_PROMPTS = {
    "Free OCR": "<image>\nFree OCR. ",
    "Convert to Markdown": "<image>\n<|grounding|>Convert the document to markdown. ",
    "Extract Text": "<image>\nExtract all text from the image. ",
}
# Task used when an unknown task type is received.
_DEFAULT_TASK = "Free OCR"

# Keyword arguments forwarded to model.infer() for each "model size" preset.
_SIZE_CONFIGS = {
    "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
    "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
    "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
    "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
    "Gundam (Recommended)": {
        "base_size": 1024,
        "image_size": 640,
        "crop_mode": True,
    },
}
# Preset used when an unknown model size is received.
_DEFAULT_SIZE = "Gundam (Recommended)"


@spaces.GPU
def process_image(image, model_size, task_type):
    """
    Run DeepSeek-OCR on an uploaded image and return the recognized text.

    Args:
        image: PIL image to process (it must provide ``.mode``,
            ``.convert()`` and ``.save()``). ``None`` means nothing was
            uploaded.
        model_size: Key into the size presets ("Tiny", "Small", "Base",
            "Large", "Gundam (Recommended)"). Unknown values fall back to
            "Gundam (Recommended)".
        task_type: "Free OCR", "Convert to Markdown" or "Extract Text".
            Unknown values fall back to the "Free OCR" prompt.

    Returns:
        Whatever ``model.infer`` returns; if that is ``None``, the contents
        of ``result.mmd`` from the output directory when that file exists.

    Raises:
        gr.Error: If no image was provided.
    """
    # Without this guard, image.save() below fails with AttributeError when
    # the button is pressed before an image has been uploaded.
    if image is None:
        raise gr.Error("Please upload an image before processing.")

    prompt = _TASK_PROMPTS.get(task_type, _TASK_PROMPTS[_DEFAULT_TASK])
    config = _SIZE_CONFIGS.get(model_size, _SIZE_CONFIGS[_DEFAULT_SIZE])

    # Everything written during inference lives in a throwaway directory
    # that is removed when the block exits.
    with tempfile.TemporaryDirectory() as output_path:
        temp_image_path = os.path.join(output_path, "temp_image.jpg")

        # The temp file is a JPEG, which cannot store alpha or palette
        # images; normalise to RGB so save() does not raise for such inputs.
        if image.mode != "RGB":
            image = image.convert("RGB")
        image.save(temp_image_path)

        result = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
        )

        # NOTE(review): with save_results=True the upstream infer() appears
        # to write its text to "result.mmd" in output_path and may return
        # None -- confirm against the model's remote code. The read is
        # guarded, so it is a no-op if that assumption is wrong.
        if result is None:
            saved_result_path = os.path.join(output_path, "result.mmd")
            if os.path.isfile(saved_result_path):
                with open(saved_result_path, encoding="utf-8") as saved_file:
                    result = saved_file.read()

    return result
75
+
76
+
77
# Build the Gradio UI: inputs on the left, OCR result on the right.
with gr.Blocks(title="DeepSeek-OCR") as demo:
    gr.Markdown(
        """
        # DeepSeek-OCR Document Recognition

        Upload an image to extract text using DeepSeek-OCR model.
        Supports various document types and handwriting recognition.

        **Model Sizes:**
        - **Tiny**: Fastest, lower accuracy (512x512)
        - **Small**: Fast, good accuracy (640x640)
        - **Base**: Balanced performance (1024x1024)
        - **Large**: Best accuracy, slower (1280x1280)
        - **Gundam (Recommended)**: Optimized for documents (1024 base, 640 image, crop mode)
        """
    )

    # Dropdown options; the strings are the values passed to process_image.
    size_choices = ["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"]
    task_choices = ["Free OCR", "Convert to Markdown", "Extract Text"]

    with gr.Row():
        # Left column: image upload, presets and the submit button.
        with gr.Column():
            image_input = gr.Image(
                label="Upload Image",
                type="pil",
                sources=["upload", "clipboard"],
            )

            model_size = gr.Dropdown(
                label="Model Size",
                choices=size_choices,
                value="Gundam (Recommended)",
            )

            task_type = gr.Dropdown(
                label="Task Type",
                choices=task_choices,
                value="Convert to Markdown",
            )

            submit_btn = gr.Button("Process Image", variant="primary")

        # Right column: recognized text.
        with gr.Column():
            output_text = gr.Textbox(
                label="OCR Result",
                lines=20,
                show_copy_button=True,
            )

    # Example rows are (image path, model size, task type); caching is off.
    gr.Examples(
        examples=[
            ["examples/math.png", "Gundam (Recommended)", "Convert to Markdown"],
            ["examples/receipt.jpg", "Base", "Free OCR"],
        ],
        fn=process_image,
        inputs=[image_input, model_size, task_type],
        outputs=output_text,
        cache_examples=False,
    )

    # Clicking the button runs OCR with the current input values.
    submit_btn.click(
        fn=process_image,
        inputs=[image_input, model_size, task_type],
        outputs=output_text,
    )
137
+
138
# Start the app only when this file is executed as a script; the request
# queue is capped at 20 entries.
if __name__ == "__main__":
    demo.queue(max_size=20).launch()
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.6.0
2
+ transformers==4.46.3
3
+ tokenizers==0.20.3
4
+ einops
5
+ addict
6
+ easydict
7
+ gradio>=4.0.0
8
+ spaces>=0.20.0
9
+ Pillow>=10.0.0
10
+ safetensors>=0.4.0
11
+ accelerate>=0.24.0
12
+ sentencepiece>=0.1.99
13
+ protobuf>=3.20.0
14
+ torchvision
15
+ flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl