File size: 3,774 Bytes
5f2550e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
import re
from PIL import Image
import spaces  # Add spaces import for Hugging Face Spaces

# Model information
# Fixed checkpoint and prompt used for every request.
MODEL_ID = "DeepMount00/SmolVLM-Base-ocr_base"
# Italian instruction, roughly: "You are an OCR expert assistant,
# convert the text to MD format." Kept verbatim — the model was tuned
# on this exact prompt.
OCR_INSTRUCTION = "Sei un assistente esperto di OCR, converti il testo in formato MD."

# Load processor and model once at import time so every request reuses them.
# bfloat16 halves memory versus fp32; the model is moved to CUDA eagerly
# because Spaces allocates the GPU per-call via the @spaces.GPU decorator.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
).to("cuda")  # Ensure model loads on CUDA for Spaces

@spaces.GPU  # Request GPU allocation on Hugging Face Spaces for this call
def process_image(image, progress=gr.Progress()):
    """Run OCR on an uploaded image and return the extracted text (Markdown).

    Args:
        image: PIL image (or a file-path string) from the ``gr.Image``
            component; ``None`` when nothing was uploaded.
        progress: Gradio progress tracker, injected by the framework.

    Returns:
        The model's OCR output with the chat-template scaffolding
        ("User:" echo and "Assistant:" prefix) stripped.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # BUG FIX: gr.Error must be *raised* to surface in the UI.
        # The original merely constructed the exception object and
        # discarded it, so users never saw the error toast.
        raise gr.Error("Please upload an image to process.")

    progress(0, desc="Starting OCR processing...")

    # Gradio may hand us a file path instead of a PIL image.
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")

    progress(0.2, desc="Preparing image...")

    # Single user turn carrying the image plus the fixed OCR instruction.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": OCR_INSTRUCTION},
            ],
        },
    ]

    # Prepare inputs for the model.
    progress(0.4, desc="Processing with model...")
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = inputs.to('cuda')  # Model lives on CUDA; inputs must match.

    # Greedy decoding. NOTE: the original passed temperature=0.1 without
    # do_sample=True, so transformers ignored the temperature and logged a
    # warning; dropping it is behavior-identical and silences the warning.
    progress(0.6, desc="Generating text...")
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4096,
        )

    # Decode outputs (includes the echoed prompt, cleaned up below).
    progress(0.8, desc="Finalizing results...")
    generated_text = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True
    )[0]

    # Strip the echoed user turn, then the "Assistant:" prefix, so only
    # the model's answer remains.
    cleaned_text = re.sub(
        r"User:.*?(?=Assistant:|$)", "", generated_text, flags=re.DOTALL
    )
    cleaned_text = re.sub(r"Assistant:\s*", "", cleaned_text)
    cleaned_text = cleaned_text.strip()

    progress(1.0, desc="Done!")
    return cleaned_text  # Return only the cleaned text


# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# OCR to Markdown Converter")
    gr.Markdown(
        f"Upload an image containing text to convert it to Markdown format. This tool uses the {MODEL_ID} model with a fixed instruction: '{OCR_INSTRUCTION}'")

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(type="pil", label="Upload an image containing text")
            submit_btn = gr.Button("Process Image", variant="primary")
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Raw Text", lines=15)
            copy_btn = gr.Button("Select All Text", variant="secondary")

    submit_btn.click(
        fn=process_image,
        inputs=input_image,
        outputs=output_text,
        show_progress="full",
        queue=True  # Enable queue for Spaces
    )

    def copy_to_clipboard(text):
        return text

    copy_btn.click(
        fn=copy_to_clipboard,
        inputs=output_text,
        outputs=output_text
    )

# Launch the app with default Spaces configuration (no need for local file paths)
demo.launch()