# Spaces: Running
# app.py
import gradio as gr | |
from PIL import Image | |
from transformers import VisionEncoderDecoderModel, TrOCRProcessor | |
import torch | |
# --- One-time startup: choose a device, then load processor and model ---
print("--- Initializing Solver Service ---")

# Prefer CUDA when torch reports a usable GPU; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Both objects are created at import time so that every call to
# solve_captcha reuses them instead of reloading the checkpoint.
print("1. Loading TrOCR processor...")
processor = TrOCRProcessor.from_pretrained(
    "anuashok/ocr-captcha-v3",
    use_fast=True,
)
print(" - Processor loaded.")

print("2. Loading VisionEncoderDecoder model...")
model = VisionEncoderDecoderModel.from_pretrained("anuashok/ocr-captcha-v3")
model = model.to(device)
print(" - Model loaded.")

print(f"--- Model is running on: {device.upper()} ---")
# --- End of startup loading ---
def solve_captcha(input_image: Image.Image) -> str:
    """Read the text of a CAPTCHA image with the pre-loaded TrOCR model.

    The image is converted to RGBA, composited over a white background of the
    same size, converted to RGB, and then passed through the module-level
    ``processor`` and ``model``.

    Args:
        input_image: PIL image supplied by the Gradio ``Image`` input
            (``type="pil"``). Gradio passes ``None`` when the request
            carries no image.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens skipped.

    Raises:
        gr.Error: If ``input_image`` is ``None``.
    """
    print("--- Received image for solving ---")

    # An empty submission would otherwise fail on ``None.convert`` with an
    # opaque AttributeError; surface a readable message to the caller instead.
    if input_image is None:
        raise gr.Error("No image provided. Please upload a CAPTCHA image.")

    # 1. Convert the input to RGBA so it can be alpha-composited.
    image = input_image.convert("RGBA")

    # 2. Prepare a white background of the same size.
    background = Image.new("RGBA", image.size, (255, 255, 255))

    # 3. Composite the image onto the background and convert to RGB.
    combined = Image.alpha_composite(background, image).convert("RGB")
    print(" - Image pre-processing complete.")

    # 4. Turn the image into model input tensors on the model's device.
    pixel_values = processor(images=combined, return_tensors="pt").pixel_values.to(device)
    print(" - Image prepared for model.")

    # 5. Run model inference.
    generated_ids = model.generate(pixel_values)
    print(" - Model inference complete.")

    # 6. Decode the token ids; one image was submitted, so take the first entry.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f" - Decoding complete. Result: {generated_text}")

    return generated_text
# --- Gradio interface and API endpoint ---
# Components are built first (image input, then text output) and wired to
# solve_captcha; launch() then starts the app.
captcha_input = gr.Image(type="pil", label="Upload CAPTCHA Image")
result_output = gr.Textbox(label="Result")

demo = gr.Interface(
    fn=solve_captcha,
    inputs=captcha_input,
    outputs=result_output,
    title="TrOCR CAPTCHA Solver (Custom Logic)",
    description="An API for the anuashok/ocr-captcha-v3 model using specific pre-processing.",
)
demo.launch()