captcha-solving / app.py
AxleToe's picture
Create app.py
066a23d verified
# app.py
import gradio as gr
from PIL import Image
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
import torch
print("--- Initializing Solver Service ---")

# Single source of truth for the checkpoint: the processor and the model must
# always be loaded from the same repository, so the name is spelled only once.
MODEL_ID = "anuashok/ocr-captcha-v3"

# Use a GPU if available (Hugging Face may provide one)
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- LOAD MODELS ONLY ONCE AT STARTUP ---
# Loading happens at import time so every request reuses the same objects
# instead of paying the load cost per call.
print("1. Loading TrOCR processor...")
processor = TrOCRProcessor.from_pretrained(MODEL_ID, use_fast=True)
print(" - Processor loaded.")

print("2. Loading VisionEncoderDecoder model...")
model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID).to(device)
print(" - Model loaded.")

print(f"--- Model is running on: {device.upper()} ---")
# --- END OF HEAVY LOADING ---
def solve_captcha(input_image: Image.Image) -> str:
    """
    Solve a CAPTCHA image using the pre-loaded TrOCR processor and model.

    The image is flattened onto an opaque white background before being
    handed to the processor, so transparent regions are rendered as white.

    Args:
        input_image: PIL image supplied by the Gradio image input. Gradio
            passes ``None`` when a request is submitted without an image.

    Returns:
        The text decoded from the first generated sequence, with special
        tokens removed.

    Raises:
        gr.Error: If no image was supplied.
    """
    # Fail with a readable message instead of an AttributeError on None.
    if input_image is None:
        raise gr.Error("No image provided. Please upload a CAPTCHA image.")

    print("--- Received image for solving ---")

    # 1. Convert the input to RGBA so it carries an alpha channel to composite.
    image = input_image.convert("RGBA")

    # 2. Prepare an opaque white background of the same size.
    background = Image.new("RGBA", image.size, (255, 255, 255, 255))

    # 3. Composite the image onto the white background and convert to RGB.
    combined = Image.alpha_composite(background, image).convert("RGB")
    print(" - Image pre-processing complete.")

    # 4. Turn the image into model input tensors on the model's device.
    pixel_values = processor(images=combined, return_tensors="pt").pixel_values.to(device)
    print(" - Image prepared for model.")

    # 5. Run model inference.
    generated_ids = model.generate(pixel_values)
    print(" - Model inference complete.")

    # 6. Decode the result; a single image was submitted, so take entry 0.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f" - Decoding complete. Result: {generated_text}")

    return generated_text
# --- Build the Gradio UI / API endpoint around solve_captcha and start it ---
demo = gr.Interface(
    fn=solve_captcha,
    # The image is delivered to solve_captcha as a PIL image.
    inputs=gr.Image(type="pil", label="Upload CAPTCHA Image"),
    outputs=gr.Textbox(label="Result"),
    title="TrOCR CAPTCHA Solver (Custom Logic)",
    description="An API for the anuashok/ocr-captcha-v3 model using specific pre-processing.",
)
demo.launch()