File size: 3,774 Bytes
5f2550e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
import re
from PIL import Image
import spaces  # Add spaces import for Hugging Face Spaces

# Model information
# Fixed checkpoint and prompt used for every request.
MODEL_ID = "DeepMount00/SmolVLM-Base-ocr_base"
# Italian instruction, roughly: "You are an OCR expert assistant,
# convert the text to MD format." Kept verbatim — the model was tuned
# on this exact prompt.
OCR_INSTRUCTION = "Sei un assistente esperto di OCR, converti il testo in formato MD."

# Load processor and model once at import time so every request reuses them.
# bfloat16 halves memory versus fp32; the model is moved to CUDA eagerly
# because Spaces allocates the GPU per-call via the @spaces.GPU decorator.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
).to("cuda")  # Ensure model loads on CUDA for Spaces

@spaces.GPU  # Request GPU allocation on Hugging Face Spaces for this call
def process_image(image, progress=gr.Progress()):
    """Run OCR on an uploaded image and return the extracted text (Markdown).

    Args:
        image: PIL image (or a file-path string) from the ``gr.Image``
            component; ``None`` when nothing was uploaded.
        progress: Gradio progress tracker, injected by the framework.

    Returns:
        The model's OCR output with the chat-template scaffolding
        ("User:" echo and "Assistant:" prefix) stripped.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # BUG FIX: gr.Error must be *raised* to surface in the UI.
        # The original merely constructed the exception object and
        # discarded it, so users never saw the error toast.
        raise gr.Error("Please upload an image to process.")

    progress(0, desc="Starting OCR processing...")

    # Gradio may hand us a file path instead of a PIL image.
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")

    progress(0.2, desc="Preparing image...")

    # Single user turn carrying the image plus the fixed OCR instruction.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": OCR_INSTRUCTION},
            ],
        },
    ]

    # Prepare inputs for the model.
    progress(0.4, desc="Processing with model...")
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = inputs.to('cuda')  # Model lives on CUDA; inputs must match.

    # Greedy decoding. NOTE: the original passed temperature=0.1 without
    # do_sample=True, so transformers ignored the temperature and logged a
    # warning; dropping it is behavior-identical and silences the warning.
    progress(0.6, desc="Generating text...")
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4096,
        )

    # Decode outputs (includes the echoed prompt, cleaned up below).
    progress(0.8, desc="Finalizing results...")
    generated_text = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True
    )[0]

    # Strip the echoed user turn, then the "Assistant:" prefix, so only
    # the model's answer remains.
    cleaned_text = re.sub(
        r"User:.*?(?=Assistant:|$)", "", generated_text, flags=re.DOTALL
    )
    cleaned_text = re.sub(r"Assistant:\s*", "", cleaned_text)
    cleaned_text = cleaned_text.strip()

    progress(1.0, desc="Done!")
    return cleaned_text  # Return only the cleaned text


# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# OCR to Markdown Converter")
    gr.Markdown(
        f"Upload an image containing text to convert it to Markdown format. This tool uses the {MODEL_ID} model with a fixed instruction: '{OCR_INSTRUCTION}'")

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(type="pil", label="Upload an image containing text")
            submit_btn = gr.Button("Process Image", variant="primary")
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Raw Text", lines=15)
            copy_btn = gr.Button("Select All Text", variant="secondary")

    submit_btn.click(
        fn=process_image,
        inputs=input_image,
        outputs=output_text,
        show_progress="full",
        queue=True  # Enable queue for Spaces
    )

    def copy_to_clipboard(text):
        return text

    copy_btn.click(
        fn=copy_to_clipboard,
        inputs=output_text,
        outputs=output_text
    )

# Launch the app with default Spaces configuration (no need for local file paths)
demo.launch()