# app.py
import gradio as gr
import torch
from PIL import Image

from tinyllava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from tinyllava.conversation import conv_templates
from tinyllava.model.builder import load_pretrained_model
from tinyllava.utils import disable_torch_init
from tinyllava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path

# --- Skip redundant torch weight initialization (weights come from the checkpoint) ---
disable_torch_init()

# --- Load TinyLLaVA-3.1B ---
model_path = "bczhou/TinyLLaVA-3.1B"  # official HF model ID
device = "cpu"  # CPU Spaces; change to "cuda" if a GPU is available

tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path=model_path,
    model_base=None,  # only needed for LoRA/delta checkpoints on top of a base model
    model_name=get_model_name_from_path(model_path),
    device=device,  # supported by the LLaVA-style builder; remove if your version lacks it
)
# The builder typically loads fp16 weights; cast to fp32 for stable CPU inference.
model.to(device).float()
model.eval()

# --- Gradio handler ---
def describe_image(image, prompt):
    # TinyLLaVA wants a PIL image
    image = Image.fromarray(image)
    image_tensor = process_images([image], image_processor, model.config)
    image_tensor = image_tensor.to(device, dtype=torch.float32)

    # Wrap the question in the model's conversation template, with the <image>
    # placeholder prepended. TinyLLaVA-3.1B (Phi-2 backbone) uses the "phi" template.
    conv = conv_templates["phi"].copy()
    conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + prompt)
    conv.append_message(conv.roles[1], None)
    full_prompt = conv.get_prompt()

    # tokenizer_image_token splices IMAGE_TOKEN_INDEX into the token stream wherever
    # <image> appears; it returns input IDs directly, so no second tokenizer pass is needed.
    input_ids = (
        tokenizer_image_token(full_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        .unsqueeze(0)
        .to(device)
    )

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            do_sample=True,
            temperature=0.2,
            max_new_tokens=200,
        )

    # LLaVA-1.5-era generate() (which TinyLLaVA follows) echoes the prompt tokens,
    # so slice them off before decoding; if your version returns only new tokens,
    # decode output_ids[0] directly.
    out_text = tokenizer.decode(
        output_ids[0, input_ids.shape[1]:], skip_special_tokens=True
    ).strip()
    return out_text


iface = gr.Interface(
    fn=describe_image,
    inputs=[
        gr.Image(type="numpy", label="Image"),
        gr.Textbox(label="Your question", placeholder="What's happening in this image?"),
    ],
    outputs=gr.Textbox(label="TinyLLaVA Answer"),
    title="🦙 TinyLLaVA-3.1B — Vision-Language Q&A",
    description="A lightweight LLaVA variant that runs on CPU Spaces. Upload an image and ask a question.",
)

if __name__ == "__main__":
    iface.launch()