import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

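# Unsloth's 4-bit (bitsandbytes) quantized, instruction-tuned Gemma 3n E2B checkpoint.
# Note: bnb-4bit weights normally rely on bitsandbytes support; loading this checkpoint
# on CPU may require a recent bitsandbytes/transformers stack.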
model_id = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"

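# Load the tokenizer that matches the checkpoint.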
tokenizer = AutoTokenizer.from_pretrained(model_id)

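# Load the model in float32 and keep it on CPU in eval (inference) mode.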
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
)
model.to("cpu")
model.eval()

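# Generate one response for a plain-text prompt using greedy decoding.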
def generate_response(user_prompt):
    # Build the chat in the structured content format (a list of typed parts)
    # accepted by the Gemma chat template.
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": user_prompt.strip()}],
        },
    ]

    # return_dict=True is required so the result can be indexed by key below;
    # without it, apply_chat_template returns a bare tensor of input IDs.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to("cpu")

    input_len = inputs["input_ids"].shape[-1]

    # Greedy decoding (do_sample=False), capped at 100 new tokens.
    # use_cache=False lowers memory use at the cost of slower generation.
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,
            do_sample=False,
            use_cache=False,
        )

    # Keep only the newly generated tokens, then decode them to text.
    generated_tokens = outputs[0][input_len:]
    decoded = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return decoded.strip()

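# Minimal Gradio UI: a text box in, the model's reply out.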
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=3, label="Enter your question"),
    outputs=gr.Textbox(label="Gemma 3n Response"),
    title="🧪 Simple Gemma 3n Demo (CPU)",
    description="Test the Gemma 3n model with minimal output. Max 100 tokens.",
)

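# Launch the local web UI (Gradio serves on http://127.0.0.1:7860 by default).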
if __name__ == "__main__":
    demo.launch()