# app.py
import gradio as gr
import torch
from PIL import Image

from tinyllava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from tinyllava.conversation import conv_templates
from tinyllava.model.builder import load_pretrained_model
from tinyllava.utils import disable_torch_init
from tinyllava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path

# --- Skip redundant torch weight initialization (weights come from the checkpoint) ---
disable_torch_init()

# --- Load TinyLLaVA-3.1B ---
model_path = "bczhou/TinyLLaVA-3.1B"  # official HF model ID
device = "cpu"  # CPU Spaces; change to "cuda" if a GPU is available

tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path=model_path,
    model_base=None,  # only needed for LoRA/delta checkpoints on top of a base model
    model_name=get_model_name_from_path(model_path),
    device=device,  # supported by the LLaVA-style builder; remove if your version lacks it
)
# The builder typically loads fp16 weights; cast to fp32 for stable CPU inference.
model.to(device).float()
model.eval()

# --- Gradio handler ---
def describe_image(image, prompt):
    # TinyLLaVA wants a PIL image
    image = Image.fromarray(image)
    image_tensor = process_images([image], image_processor, model.config)
    image_tensor = image_tensor.to(device, dtype=torch.float32)

    # Wrap the question in the model's conversation template, with the <image>
    # placeholder prepended. TinyLLaVA-3.1B (Phi-2 backbone) uses the "phi" template.
    conv = conv_templates["phi"].copy()
    conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + prompt)
    conv.append_message(conv.roles[1], None)
    full_prompt = conv.get_prompt()

    # tokenizer_image_token splices IMAGE_TOKEN_INDEX into the token stream wherever
    # <image> appears; it returns input IDs directly, so no second tokenizer pass is needed.
    input_ids = (
        tokenizer_image_token(full_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        .unsqueeze(0)
        .to(device)
    )

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            do_sample=True,
            temperature=0.2,
            max_new_tokens=200,
        )

    # LLaVA-1.5-era generate() (which TinyLLaVA follows) echoes the prompt tokens,
    # so slice them off before decoding; if your version returns only new tokens,
    # decode output_ids[0] directly.
    out_text = tokenizer.decode(
        output_ids[0, input_ids.shape[1]:], skip_special_tokens=True
    ).strip()
    return out_text


iface = gr.Interface(
    fn=describe_image,
    inputs=[
        gr.Image(type="numpy", label="Image"),
        gr.Textbox(label="Your question", placeholder="What's happening in this image?"),
    ],
    outputs=gr.Textbox(label="TinyLLaVA Answer"),
    title="🦙 TinyLLaVA-3.1B — Vision-Language Q&A",
    description="A lightweight LLaVA variant that runs on CPU Spaces. Upload an image and ask a question.",
)

if __name__ == "__main__":
    iface.launch()