DeepMount00 committed
Commit 5f2550e · verified · 1 Parent(s): 68a6ee6

Create app.py

Files changed (1)
app.py  +117  -0
app.py ADDED
@@ -0,0 +1,117 @@
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
import re
from PIL import Image
import spaces  # Hugging Face Spaces helper; provides the @spaces.GPU decorator

# Model information
MODEL_ID = "DeepMount00/SmolVLM-Base-ocr_base"
# Fixed OCR instruction (Italian): "You are an expert OCR assistant, convert the text to MD format."
OCR_INSTRUCTION = "Sei un assistente esperto di OCR, converti il testo in formato MD."

# Load processor and model
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
).to("cuda")  # Load directly on CUDA; Spaces provides the GPU via @spaces.GPU

@spaces.GPU  # Request GPU acceleration for this function on Spaces
def process_image(image, progress=gr.Progress()):
    if image is None:
        # gr.Error must be raised, not just constructed, for Gradio to show it in the UI
        raise gr.Error("Please upload an image to process.")

    progress(0, desc="Starting OCR processing...")

    # Convert from Gradio's image format to PIL if a file path was passed
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")

    progress(0.2, desc="Preparing image...")

    # Build the chat messages; the fixed OCR instruction is part of the user turn
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": OCR_INSTRUCTION},
            ],
        },
    ]
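    # The chat template renders this conversation as plain "User: ... Assistant:" text
    # around an image placeholder, and add_generation_prompt=True appends the final
    # "Assistant:" turn. This is why the decoded output below still carries
    # "User:"/"Assistant:" prefixes that must be stripped afterwards.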

    # Prepare inputs
    progress(0.4, desc="Processing with model...")
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = inputs.to("cuda")  # Move inputs to CUDA alongside the model

    # Generate outputs
    progress(0.6, desc="Generating text...")
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4096,
            do_sample=True,  # without this, generate() ignores temperature and warns
            temperature=0.1,
        )

    # Decode outputs
    progress(0.8, desc="Finalizing results...")
    generated_text = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )[0]

    # Extract only the assistant's response:
    # remove any "User:" and "Assistant:" prefixes if present
    cleaned_text = generated_text

    # Remove the echoed user prompt, up to the "Assistant:" turn (or end of string)
    user_pattern = r"User:.*?(?=Assistant:|$)"
    cleaned_text = re.sub(user_pattern, "", cleaned_text, flags=re.DOTALL)

    # Remove the "Assistant:" prefix if present
    assistant_pattern = r"Assistant:\s*"
    cleaned_text = re.sub(assistant_pattern, "", cleaned_text)

    # Clean up any extra whitespace
    cleaned_text = cleaned_text.strip()

    progress(1.0, desc="Done!")
    return cleaned_text
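
# Quick smoke test outside the Gradio UI (hypothetical image path, requires a CUDA GPU;
# progress updates only render inside a running Gradio event):
#     print(process_image(Image.open("sample.png").convert("RGB")))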


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# OCR to Markdown Converter")
    gr.Markdown(
        "Upload an image containing text to convert it to Markdown format. "
        f"This tool uses the {MODEL_ID} model with a fixed instruction: "
        f"'{OCR_INSTRUCTION}'"
    )

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(type="pil", label="Upload an image containing text")
            submit_btn = gr.Button("Process Image", variant="primary")
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Raw Text", lines=15)
            copy_btn = gr.Button("Select All Text", variant="secondary")

    submit_btn.click(
        fn=process_image,
        inputs=input_image,
        outputs=output_text,
        show_progress="full",
        queue=True,  # Route requests through the Spaces queue
    )

    # The "Select All Text" button re-emits the text into the same textbox;
    # actual copying to the clipboard is left to the user
    def copy_to_clipboard(text):
        return text

    copy_btn.click(
        fn=copy_to_clipboard,
        inputs=output_text,
        outputs=output_text,
    )

# Launch the app with the default Spaces configuration
demo.launch()
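
A Space built from this commit would also need its dependencies declared. A minimal requirements.txt sketch, inferred from the imports above (package names assumed, versions unpinned, not part of this commit):

gradio
spaces
transformers
torch
Pillow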