import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import spaces

model_id = "microsoft/phi-2"
device = "cuda"  # ZeroGPU space
precision = torch.float16

# Load tokenizer & model once at startup
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=precision,
).to(device)


@spaces.GPU
def respond(message, history):
    # history comes in as a list of (user, bot) pairs
    history = history or []

    # Rebuild the full conversation as a plain-text prompt
    prompt = ""
    for u, b in history:
        prompt += f"User: {u}\nAssistant: {b}\n"
    prompt += f"User: {message}\nAssistant:"

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,  # phi-2 has no pad token; silences the generate() warning
    )

    # Decode only the newly generated tokens. Slicing the decoded string by
    # len(prompt) is fragile because detokenization may not round-trip exactly.
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(generated, skip_special_tokens=True)
    # Keep only the first line so the model can't append extra fake turns
    response = response.strip().split("\n")[0]

    # Append the new turn as a [user, bot] pair of plain strings
    history.append([message, response])

    # Return "" to clear the textbox, and the updated history for the Chatbot
    return "", history


with gr.Blocks() as demo:
    gr.Markdown("## Phi-2 Chatbot (ZeroGPU-safe)")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your message")
    clear = gr.Button("Clear")

    # On submit: feed (msg, history) -> (cleared textbox, updated history)
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch(share=True)