DeepMount00 committed
Commit 5f2550e · verified · 1 Parent(s): 68a6ee6

Create app.py

Files changed (1)
app.py  +117  -0
app.py ADDED
@@ -0,0 +1,117 @@
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
import re
from PIL import Image
import spaces  # Hugging Face Spaces helper; provides the @spaces.GPU decorator

# Model information
MODEL_ID = "DeepMount00/SmolVLM-Base-ocr_base"
# Fixed OCR instruction (Italian): "You are an expert OCR assistant, convert the text to MD format."
OCR_INSTRUCTION = "Sei un assistente esperto di OCR, converti il testo in formato MD."

# Load processor and model
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
).to("cuda")  # Load directly on CUDA; Spaces provides the GPU via @spaces.GPU

@spaces.GPU  # Request GPU acceleration for this function on Spaces
def process_image(image, progress=gr.Progress()):
    if image is None:
        # gr.Error must be raised, not just constructed, for Gradio to show it in the UI
        raise gr.Error("Please upload an image to process.")

    progress(0, desc="Starting OCR processing...")

    # Convert from Gradio's image format to PIL if a file path was passed
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")

    progress(0.2, desc="Preparing image...")

    # Build the chat messages; the fixed OCR instruction is part of the user turn
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": OCR_INSTRUCTION},
            ],
        },
    ]
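    # The chat template renders this conversation as plain "User: ... Assistant:" text
    # around an image placeholder, and add_generation_prompt=True appends the final
    # "Assistant:" turn. This is why the decoded output below still carries
    # "User:"/"Assistant:" prefixes that must be stripped afterwards.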

    # Prepare inputs
    progress(0.4, desc="Processing with model...")
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = inputs.to("cuda")  # Move inputs to CUDA alongside the model

    # Generate outputs
    progress(0.6, desc="Generating text...")
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4096,
            do_sample=True,  # without this, generate() ignores temperature and warns
            temperature=0.1,
        )

    # Decode outputs
    progress(0.8, desc="Finalizing results...")
    generated_text = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )[0]

    # Extract only the assistant's response:
    # remove any "User:" and "Assistant:" prefixes if present
    cleaned_text = generated_text

    # Remove the echoed user prompt, up to the "Assistant:" turn (or end of string)
    user_pattern = r"User:.*?(?=Assistant:|$)"
    cleaned_text = re.sub(user_pattern, "", cleaned_text, flags=re.DOTALL)

    # Remove the "Assistant:" prefix if present
    assistant_pattern = r"Assistant:\s*"
    cleaned_text = re.sub(assistant_pattern, "", cleaned_text)

    # Clean up any extra whitespace
    cleaned_text = cleaned_text.strip()

    progress(1.0, desc="Done!")
    return cleaned_text
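
# Quick smoke test outside the Gradio UI (hypothetical image path, requires a CUDA GPU;
# progress updates only render inside a running Gradio event):
#     print(process_image(Image.open("sample.png").convert("RGB")))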


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# OCR to Markdown Converter")
    gr.Markdown(
        "Upload an image containing text to convert it to Markdown format. "
        f"This tool uses the {MODEL_ID} model with a fixed instruction: "
        f"'{OCR_INSTRUCTION}'"
    )

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(type="pil", label="Upload an image containing text")
            submit_btn = gr.Button("Process Image", variant="primary")
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Raw Text", lines=15)
            copy_btn = gr.Button("Select All Text", variant="secondary")

    submit_btn.click(
        fn=process_image,
        inputs=input_image,
        outputs=output_text,
        show_progress="full",
        queue=True,  # Route requests through the Spaces queue
    )

    # The "Select All Text" button re-emits the text into the same textbox;
    # actual copying to the clipboard is left to the user
    def copy_to_clipboard(text):
        return text

    copy_btn.click(
        fn=copy_to_clipboard,
        inputs=output_text,
        outputs=output_text,
    )

# Launch the app with the default Spaces configuration
demo.launch()
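
A Space built from this commit would also need its dependencies declared. A minimal requirements.txt sketch, inferred from the imports above (package names assumed, versions unpinned, not part of this commit):

gradio
spaces
transformers
torch
Pillow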