import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import textwrap

model_id = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load model in full precision on CPU (no bitsandbytes)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cpu",           # Force CPU
    torch_dtype=torch.float32,  # Use FP32 to ensure CPU compatibility
)
model.eval()

# Helper to format response nicely
def print_response(text: str) -> str:
    return "\n".join(textwrap.fill(line, 100) for line in text.split("\n"))

# Inference function for Gradio
def predict_text(system_prompt: str, user_prompt: str) -> str:
    messages = [
        {"role": "system", "content": [{"type": "text", "text": system_prompt.strip()}]},
        {"role": "user", "content": [{"type": "text", "text": user_prompt.strip()}]},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to("cpu")
    input_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        output = model.generate(
            **inputs,
            max_new_tokens=300,
            do_sample=False,
            use_cache=False,  # Important for CPU compatibility
        )

    generated = output[0][input_len:]
    decoded = tokenizer.decode(generated, skip_special_tokens=True)
    return print_response(decoded)

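# --- Optional: streaming output (a hedged sketch, not wired into the UI above) ---
# transformers' TextStreamer only prints tokens to stdout, so it is not useful for a
# Gradio Textbox. To stream into the UI you could swap in TextIteratorStreamer, run
# generate() in a background thread, and turn the handler into a generator, roughly
# as below; `predict_text_stream` is a hypothetical variant, and depending on the
# Gradio version you may also need demo.queue() for streamed updates.
#
#   from threading import Thread
#   from transformers import TextIteratorStreamer
#
#   def predict_text_stream(system_prompt: str, user_prompt: str):
#       messages = [
#           {"role": "system", "content": [{"type": "text", "text": system_prompt.strip()}]},
#           {"role": "user", "content": [{"type": "text", "text": user_prompt.strip()}]},
#       ]
#       inputs = tokenizer.apply_chat_template(
#           messages, add_generation_prompt=True, tokenize=True,
#           return_dict=True, return_tensors="pt",
#       ).to("cpu")
#       streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#       Thread(
#           target=model.generate,
#           kwargs=dict(**inputs, max_new_tokens=300, do_sample=False,
#                       use_cache=False, streamer=streamer),
#       ).start()
#       partial = ""
#       for piece in streamer:   # Yields decoded text chunks as they are generated
#           partial += piece
#           yield partial        # Gradio accepts generator functions for live updates
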
# Gradio UI
demo = gr.Interface(
    fn=predict_text,
    inputs=[
        gr.Textbox(lines=2, label="System Prompt", value="You are a helpful assistant."),
        gr.Textbox(lines=4, label="User Prompt", placeholder="Ask something..."),
    ],
    outputs=gr.Textbox(label="Gemma 3n Response"),
    title="Gemma 3n Chat (CPU-friendly)",
    description="Lightweight CPU-only chatbot using a quantized Gemma 3n model.",
)

if __name__ == "__main__":
    demo.launch()
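
# --- Optional: query the running app programmatically (a minimal sketch) ---
# Assumes the script was started with `python app.py` (filename is an assumption) and
# is serving on Gradio's default local address (http://127.0.0.1:7860); "/predict" is
# the default endpoint name exposed by gr.Interface. Requires `pip install gradio_client`.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   answer = client.predict(
#       "You are a helpful assistant.",              # System Prompt
#       "Explain what device_map='cpu' does here.",  # User Prompt
#       api_name="/predict",
#   )
#   print(answer)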