# app.py
"""Gradio service that reads CAPTCHA text with the anuashok/ocr-captcha-v3 TrOCR model.

The processor and model are loaded once at import time so every request
handled by ``solve_captcha`` reuses the same in-memory weights.
"""

import gradio as gr
from PIL import Image
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
import torch

print("--- Initializing Solver Service ---")

# Use a GPU if available (Hugging Face may provide one)
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- LOAD MODELS ONLY ONCE AT STARTUP ---
print("1. Loading TrOCR processor...")
processor = TrOCRProcessor.from_pretrained("anuashok/ocr-captcha-v3", use_fast=True)
print(" - Processor loaded.")

print("2. Loading VisionEncoderDecoder model...")
model = VisionEncoderDecoderModel.from_pretrained("anuashok/ocr-captcha-v3").to(device)
print(" - Model loaded.")

print(f"--- Model is running on: {device.upper()} ---")
# --- END OF HEAVY LOADING ---


def solve_captcha(input_image: Image.Image) -> str:
    """Return the text read from a CAPTCHA image using the pre-loaded model.

    The image is flattened onto a white background before recognition so
    that transparent regions do not reach the model as arbitrary pixels.

    Args:
        input_image: The uploaded image as a PIL image (the Gradio input
            below is declared with ``type="pil"``).

    Returns:
        The first decoded sequence, with special tokens stripped.

    Raises:
        gr.Error: If no image was supplied with the request.
    """
    print("--- Received image for solving ---")

    # A submission without an image would otherwise fail on ``.convert``
    # with an opaque AttributeError; report it to the caller instead.
    if input_image is None:
        raise gr.Error("Please upload a CAPTCHA image before submitting.")

    # 1. Normalise to RGBA so alpha compositing is well-defined.
    image = input_image.convert("RGBA")

    # 2. Prepare a fully opaque white background of the same size.
    background = Image.new("RGBA", image.size, (255, 255, 255, 255))

    # 3. Composite the image onto the white background and drop alpha.
    combined = Image.alpha_composite(background, image).convert("RGB")
    print(" - Image pre-processing complete.")

    # 4. Prepare image for the model (tensor moved to the model's device).
    pixel_values = processor(images=combined, return_tensors="pt").pixel_values.to(device)
    print(" - Image prepared for model.")

    # 5. Run model inference.
    generated_ids = model.generate(pixel_values)
    print(" - Model inference complete.")

    # 6. Decode the result; a single image was submitted, so take item 0.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f" - Decoding complete. Result: {generated_text}")

    return generated_text


# --- Create the Gradio Interface and API Endpoint ---
demo = gr.Interface(
    fn=solve_captcha,
    inputs=gr.Image(type="pil", label="Upload CAPTCHA Image"),
    outputs=gr.Textbox(label="Result"),
    title="TrOCR CAPTCHA Solver (Custom Logic)",
    description="An API for the anuashok/ocr-captcha-v3 model using specific pre-processing.",
)
demo.launch()