|
import gradio as gr |
|
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration |
|
from PIL import Image |
|
|
|
|
|
# Checkpoint identifier passed to both `from_pretrained` calls below. Keeping
# it in one constant guarantees the model and its processor are always loaded
# from the same checkpoint.
MODEL_ID = "google/pix2struct-screen2words-large"

# Both objects are created once, when the module is executed, and are then
# reused as globals by `describe_ui` on every call.
model = Pix2StructForConditionalGeneration.from_pretrained(MODEL_ID)
processor = Pix2StructProcessor.from_pretrained(MODEL_ID)
|
|
|
|
|
def describe_ui(image) -> str:
    """Generate a text description of a UI screenshot.

    Args:
        image: The picture to describe, forwarded unchanged to the
            module-level ``processor``. ``None`` is accepted and means
            "no image supplied" (presumably what the Gradio image input
            sends when the form is submitted empty -- confirm against
            the Gradio version in use).

    Returns:
        The decoded text of the first generated sequence with special
        tokens stripped, or a short hint asking for an image when
        ``image`` is ``None``.
    """
    if image is None:
        # Nothing to run the model on: answer with a readable hint instead
        # of letting the processor fail on a missing image.
        return "Please upload an image to describe."

    # Convert the image into PyTorch tensors ("pt") for the model.
    inputs = processor(images=image, return_tensors="pt")
    outputs = model.generate(**inputs)
    # Only one image is submitted per call, so only the first generated
    # sequence is decoded.
    return processor.decode(outputs[0], skip_special_tokens=True)
|
|
|
|
|
# Assemble the web interface around `describe_ui`: one image input that is
# delivered to the callback as a PIL image, and one plain-text output.
demo = gr.Interface(
    fn=describe_ui,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="UI Screen Describer (Pix2Struct)",
    description="Upload a screenshot or UI image and get an automatic description powered by Google’s Pix2Struct model.",
)

# Start serving the interface as soon as this module is executed.
demo.launch()
|
|