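# Gradio chat demo that streams responses from a quantized (GGUF) Gemma 3 270M
# model running on CPU via llama-cpp-python.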
import os

import gradio as gr
from huggingface_hub import hf_hub_download

# Install llama-cpp-python at runtime (before it is imported) so the app also
# works in environments where the package is not preinstalled.
os.system("pip install llama-cpp-python")
from llama_cpp import Llama

# GGUF repository and quantized weights file on the Hugging Face Hub.
MODEL_REPO = "unsloth/gemma-3-270m-it-GGUF"
QUANTIZED_FILENAME = "gemma-3-270m-it-Q4_K_M.gguf"

# Avatar images for the chat window (user and assistant).
bot_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/bot.png"
user_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/user.png"
|
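# Download the quantized weights from the Hub and load them for CPU-only
# inference; stop the app if either step fails.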
try:
    print(f"Downloading model: {QUANTIZED_FILENAME} from {MODEL_REPO}...")
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=QUANTIZED_FILENAME
    )

    print("Loading GGUF model for CPU inference...")
    llm = Llama(
        model_path=model_path,
        n_ctx=8192,
        n_gpu_layers=0,
        verbose=False
    )
    print("Model loaded successfully.")

except Exception as e:
    print(f"Error loading model: {e}")
    exit()

|
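# Note: the prompt below is built as plain "System / User / Assistant" text rather
# than Gemma's native chat template; this keeps the example simple, but it may not
# match the formatting the instruction-tuned model was trained on.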
def generate_chat_stream(user_message, history, system_prompt, max_new_tokens, temperature, top_p):
    """
    A single generator function to handle streaming chat responses.
    """
    # Add the new user turn with an empty assistant reply to be filled in.
    history.append([user_message, ""])

    # Rebuild the full conversation as a plain-text prompt.
    full_prompt = ""
    if system_prompt and system_prompt.strip():
        full_prompt += f"System: {system_prompt}\n"

    # Previous turns (everything except the message just appended).
    for user_msg, model_msg in history[:-1]:
        full_prompt += f"User: {user_msg}\n"
        if model_msg is not None:
            full_prompt += f"Assistant: {model_msg}\n"

    full_prompt += f"User: {user_message}\nAssistant: "

    # Stream the completion; stop if the model starts writing the next user turn.
    stream = llm(
        prompt=full_prompt,
        max_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        stop=["\nUser:"],
        stream=True
    )

    # Append each new chunk to the last assistant message and push the update to
    # the UI; the second output clears the textbox.
    for output in stream:
        history[-1][1] += output['choices'][0]['text']
        yield history, ""

|
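# Build the Gradio UI: chat window, message box, sampling controls, and a clear button.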
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"""# Gradio Chat Demo (CPU Optimized with GGUF)
### Model: `{MODEL_REPO}` | Quantization: `{QUANTIZED_FILENAME}`""")

    chatbot = gr.Chatbot(label="Chat History", height=500, avatar_images=(user_im, bot_im))

    msg = gr.Textbox(
        label="Your Message",
        placeholder="Type your message here and press Enter...",
    )

    with gr.Accordion("Model Parameters", open=False):
        system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful assistant.")
        max_new_tokens = gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max New Tokens")
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)")

    clear = gr.Button("Clear Chat History")

    # Submitting the textbox streams the reply into the chatbot and clears the box.
    msg.submit(
        generate_chat_stream,
        [msg, chatbot, system_prompt, max_new_tokens, temperature, top_p],
        [chatbot, msg]
    )

    # Reset the conversation.
    clear.click(lambda: [], None, chatbot, queue=False)


demo.queue().launch(debug=True)