Spaces:

DeepMount00
/

Italian_OCR

Running on Zero

App Files Files Community

Italian_OCR / app.py

DeepMount00

Create app.py

5f2550e verified 28 days ago

raw

history blame

3.77 kB

	import gradio as gr
	from transformers import AutoProcessor, AutoModelForVision2Seq
	import torch
	import re
	from PIL import Image
	import spaces # Add spaces import for Hugging Face Spaces

	# Model configuration: checkpoint to load and the fixed (Italian) OCR instruction
	# that is sent to the model with every image.
	MODEL_ID = "DeepMount00/SmolVLM-Base-ocr_base"
	OCR_INSTRUCTION = "Sei un assistente esperto di OCR, converti il testo in formato MD."

	# Load the processor and the bfloat16 model once at import time.
	processor = AutoProcessor.from_pretrained(MODEL_ID)
	model = AutoModelForVision2Seq.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
	# Ensure the model lives on CUDA for Spaces.
	model = model.to("cuda")

	# Matches only the chat prompt echoed at the very start of the decoded output:
	# an optional leading "User: ..." turn followed by the first "Assistant:" marker.
	# Anchoring with \A keeps "User:" / "Assistant:" that appear inside the
	# transcribed document itself from being deleted.
	_PROMPT_ECHO_RE = re.compile(r"\A\s*(?:User:.*?)?Assistant:\s*", re.DOTALL)


	def _strip_prompt_echo(text):
	    """Return *text* without the leading echoed chat prompt, whitespace-trimmed.

	    If no leading "Assistant:" marker is found the text is only trimmed, so
	    model output is never discarded wholesale.
	    """
	    return _PROMPT_ECHO_RE.sub("", text, count=1).strip()


	@spaces.GPU  # Run on GPU when hosted on Hugging Face Spaces
	def process_image(image, progress=gr.Progress()):
	    """Run OCR on an image and return the model's transcription.

	    Args:
	        image: The image to transcribe. A ``str`` is treated as a file path
	            and loaded as RGB; any other non-``None`` value is handed to the
	            processor unchanged.
	        progress: Gradio progress tracker, updated at each pipeline stage.

	    Returns:
	        The decoded model output with the echoed chat prompt removed and
	        surrounding whitespace stripped.

	    Raises:
	        gr.Error: If ``image`` is ``None``.
	    """
	    if image is None:
	        # gr.Error has to be raised for Gradio to surface it; constructing it
	        # without raising has no effect.
	        raise gr.Error("Please upload an image to process.")

	    progress(0, desc="Starting OCR processing...")

	    # A string input is a file path: load it and normalize to RGB.
	    if isinstance(image, str):
	        with Image.open(image) as opened:
	            image = opened.convert("RGB")

	    progress(0.2, desc="Preparing image...")

	    # Single user turn: an image placeholder followed by the fixed OCR instruction.
	    messages = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "image"},
	                {"type": "text", "text": OCR_INSTRUCTION},
	            ],
	        },
	    ]

	    # Prepare inputs
	    progress(0.4, desc="Processing with model...")
	    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
	    inputs = processor(text=prompt, images=[image], return_tensors="pt")
	    inputs = inputs.to("cuda")  # Move inputs to CUDA

	    # Generate outputs
	    progress(0.6, desc="Generating text...")
	    with torch.no_grad():
	        generated_ids = model.generate(
	            **inputs,
	            max_new_tokens=4096,
	            # NOTE(review): temperature presumably only matters when sampling
	            # is enabled; confirm against the model's generation config.
	            temperature=0.1,
	        )

	    # Decode outputs
	    progress(0.8, desc="Finalizing results...")
	    generated_text = processor.batch_decode(
	        generated_ids,
	        skip_special_tokens=True,
	    )[0]

	    # Keep only the assistant's response.
	    cleaned_text = _strip_prompt_echo(generated_text)

	    progress(1.0, desc="Done!")
	    return cleaned_text


	# Create Gradio interface: image upload and submit button in one column,
	# OCR output and a select-all button in the other.
	with gr.Blocks() as demo:
	    gr.Markdown("# OCR to Markdown Converter")
	    gr.Markdown(
	        "Upload an image containing text to convert it to Markdown format. "
	        f"This tool uses the {MODEL_ID} model with a fixed instruction: '{OCR_INSTRUCTION}'"
	    )

	    with gr.Row():
	        with gr.Column(scale=1):
	            input_image = gr.Image(type="pil", label="Upload an image containing text")
	            submit_btn = gr.Button("Process Image", variant="primary")
	        with gr.Column(scale=1):
	            # elem_id gives the select-all script below a stable DOM hook.
	            output_text = gr.Textbox(label="Raw Text", lines=15, elem_id="ocr-output")
	            copy_btn = gr.Button("Select All Text", variant="secondary")

	    submit_btn.click(
	        fn=process_image,
	        inputs=input_image,
	        outputs=output_text,
	        show_progress="full",
	        queue=True,  # Enable queue for Spaces
	    )

	    # Selecting text happens in the browser, so the button runs client-side
	    # JS only; a Python handler that echoes the value back cannot select it.
	    copy_btn.click(
	        fn=None,
	        inputs=None,
	        outputs=None,
	        js=(
	            "() => {"
	            " const box = document.querySelector('#ocr-output textarea');"
	            " if (box) { box.focus(); box.select(); }"
	            " }"
	        ),
	    )

	# Launch the app with default Spaces configuration (no need for local file paths)
	demo.launch()