Spaces:

AxleToe
/

captcha-solving

Running

App Files Files Community

captcha-solving / app.py

AxleToe

Create app.py

066a23d verified 21 days ago

raw

history blame contribute delete

2.26 kB

	# app.py

	import gradio as gr
	from PIL import Image
	from transformers import VisionEncoderDecoderModel, TrOCRProcessor
	import torch

	# Startup banner; everything in this section runs once, at import time.
	print("--- Initializing Solver Service ---")

	# Pick CUDA when torch reports it is available, otherwise stay on the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# The processor and the model are both loaded from this one checkpoint.
	_checkpoint = "anuashok/ocr-captcha-v3"

	# --- ONE-TIME SETUP: these objects are reused by every request ---
	print("1. Loading TrOCR processor...")
	processor = TrOCRProcessor.from_pretrained(_checkpoint, use_fast=True)
	print(" - Processor loaded.")

	print("2. Loading VisionEncoderDecoder model...")
	model = VisionEncoderDecoderModel.from_pretrained(_checkpoint)
	model = model.to(device)  # place the weights on the selected device
	print(" - Model loaded.")
	print(f"--- Model is running on: {device.upper()} ---")
	# --- END OF ONE-TIME SETUP ---


	def solve_captcha(input_image: Image.Image) -> str:
	    """Read the text in a CAPTCHA image using the pre-loaded TrOCR model.

	    The image is composited onto a white background, converted to RGB,
	    passed through the module-level ``processor`` and ``model``, and the
	    first decoded sequence is returned.

	    Args:
	        input_image: The uploaded CAPTCHA as a PIL image. Gradio supplies
	            ``None`` when a request is submitted without an image.

	    Returns:
	        The decoded text with special tokens stripped.

	    Raises:
	        gr.Error: If no image was supplied.
	    """
	    # Without this guard a missing upload fails on ``.convert`` with an
	    # AttributeError, which tells the caller nothing useful.
	    if input_image is None:
	        raise gr.Error("No image provided. Please upload a CAPTCHA image.")

	    print("--- Received image for solving ---")

	    # 1. Normalise to RGBA so the image can be alpha-composited.
	    image = input_image.convert("RGBA")

	    # 2. Prepare a white background of the same size.
	    background = Image.new("RGBA", image.size, (255, 255, 255))

	    # 3. Composite the image onto the white background and drop the alpha
	    #    channel by converting to RGB.
	    combined = Image.alpha_composite(background, image).convert("RGB")
	    print(" - Image pre-processing complete.")

	    # 4. Turn the image into a tensor on the same device as the model.
	    pixel_values = processor(images=combined, return_tensors="pt").pixel_values.to(device)
	    print(" - Image prepared for model.")

	    # 5. Run model inference.
	    generated_ids = model.generate(pixel_values)
	    print(" - Model inference complete.")

	    # 6. Decode the result; a single image was submitted, so take item 0.
	    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
	    print(f" - Decoding complete. Result: {generated_text}")

	    return generated_text


	# --- Build the Gradio interface (UI and API endpoint) and start serving ---
	captcha_input = gr.Image(type="pil", label="Upload CAPTCHA Image")
	result_box = gr.Textbox(label="Result")

	demo = gr.Interface(
	    fn=solve_captcha,
	    inputs=captcha_input,
	    outputs=result_box,
	    title="TrOCR CAPTCHA Solver (Custom Logic)",
	    description="An API for the anuashok/ocr-captcha-v3 model using specific pre-processing.",
	)
	demo.launch()