import gradio as gr
from transformers import pipeline, CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load the fine-tuned ViT food classifier
classifier = pipeline("image-classification", model="Skorm/food11-vit")

# Load CLIP model for zero-shot classification
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Define CLIP labels (the Food-11 categories)
clip_labels = [
    "Bread", "Dairy product", "Dessert", "Egg", "Fried food", "Meat",
    "Noodles-Pasta", "Rice", "Seafood", "Soup", "Vegetable-Fruit",
]


def classify_food(image_path):
    # Ensure a 3-channel image regardless of the input file's mode
    image = Image.open(image_path).convert("RGB")

    # ----- ViT prediction -----
    vit_results = classifier(image_path)
    vit_output = {result["label"]: round(result["score"], 4) for result in vit_results}

    # ----- CLIP zero-shot prediction -----
    inputs = clip_processor(text=clip_labels, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only; skip gradient tracking
        outputs = clip_model(**inputs)
    probs = outputs.logits_per_image.softmax(dim=1)[0]
    clip_output = {label: round(float(score), 4) for label, score in zip(clip_labels, probs)}

    return vit_output, clip_output


# Example image paths
examples = [
    ["example_images/bread.jpg"],
    ["example_images/dessert.jpg"],
    ["example_images/fruits.jpg"],
    ["example_images/noodles.jpeg"],
    ["example_images/ramen.jpg"],
    ["example_images/seafood.jpg"],
]

# Gradio interface
iface = gr.Interface(
    fn=classify_food,
    inputs=gr.Image(type="filepath"),
    outputs=[
        gr.Label(num_top_classes=3, label="ViT (Fine-tuned) Prediction"),
        gr.Label(num_top_classes=3, label="CLIP Zero-Shot Prediction"),
    ],
    title="🍽️ Food Classification with ViT and Zero-Shot CLIP",
    description="Upload a food image. The app compares predictions between your fine-tuned ViT model and zero-shot CLIP.",
    examples=examples,
)

iface.launch()