import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import spaces

model_id = "microsoft/phi-2"
device = "cuda"  # ZeroGPU space
precision = torch.float16

# Load tokenizer & model once at startup
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=precision,
).to(device)


@spaces.GPU
def respond(message, history):
    # history comes in as a list of (user, bot) pairs
    history = history or []

    # Rebuild the full conversation as a plain-text prompt
    prompt = ""
    for u, b in history:
        prompt += f"User: {u}\nAssistant: {b}\n"
    prompt += f"User: {message}\nAssistant:"

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,  # phi-2 has no pad token; silences the generate() warning
    )

    # Decode only the newly generated tokens. Slicing the decoded string by
    # len(prompt) is fragile because detokenization may not round-trip exactly.
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(generated, skip_special_tokens=True)
    # Keep only the first line so the model can't append extra fake turns
    response = response.strip().split("\n")[0]

    # Append the new turn as a [user, bot] pair of plain strings
    history.append([message, response])

    # Return "" to clear the textbox, and the updated history for the Chatbot
    return "", history


with gr.Blocks() as demo:
    gr.Markdown("## Phi-2 Chatbot (ZeroGPU-safe)")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your message")
    clear = gr.Button("Clear")

    # On submit: feed (msg, history) -> (cleared textbox, updated history)
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch(share=True)