import os
import gradio as gr
from huggingface_hub import hf_hub_download
# Install llama-cpp-python at runtime so the import below succeeds
os.system("pip install llama-cpp-python")
from llama_cpp import Llama
# --- Configuration and Model Loading ---
MODEL_REPO = "unsloth/gemma-3-270m-it-GGUF"
QUANTIZED_FILENAME = "gemma-3-270m-it-Q4_K_M.gguf"
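# gemma-3-270m-it is a ~270M-parameter, instruction-tuned Gemma model; Q4_K_M is a 4-bit
# "K-quant" GGUF file, small enough for CPU-only inference.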
bot_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/bot.png"
user_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/user.png"
try:
    print(f"Downloading model: {QUANTIZED_FILENAME} from {MODEL_REPO}...")
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=QUANTIZED_FILENAME
    )
    print("Loading GGUF model for CPU inference...")
    llm = Llama(
        model_path=model_path,
        n_ctx=8192,
        n_gpu_layers=0,  # Ensures CPU-only operation
        verbose=False
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    exit()
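# Optional sanity check (a minimal sketch, not part of the app's flow): a tiny completion
# confirms the GGUF file loaded and CPU inference works.
#
#   _check = llm("Hello", max_tokens=8)
#   print(_check["choices"][0]["text"])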
# --- Gradio UI and Logic ---
# A single generator function that handles the entire chat turn and streams tokens back to the UI.
def generate_chat_stream(user_message, history, system_prompt, max_new_tokens, temperature, top_p):
    """
    A single generator function to handle streaming chat responses.
    """
    # Add the user's message to the history, with an empty string as a placeholder for the bot's response.
    history.append([user_message, ""])

    # Build the prompt for the model
    full_prompt = ""
    if system_prompt and system_prompt.strip():
        full_prompt += f"System: {system_prompt}\n"
    # Pass the history *before* the current turn to the model
    for user_msg, model_msg in history[:-1]:
        full_prompt += f"User: {user_msg}\n"
        if model_msg is not None:
            full_prompt += f"Assistant: {model_msg}\n"
    full_prompt += f"User: {user_message}\nAssistant: "

    # Generate a streaming response from the model
    stream = llm(
        prompt=full_prompt,
        max_tokens=max_new_tokens,
        temperature=float(temperature),
        top_p=float(top_p),
        stream=True
    )

    # Yield the history object at each step to update the UI
    for output in stream:
        # Append the new token to the bot's message placeholder
        history[-1][1] += output['choices'][0]['text']
        # Yield the updated history and clear the user's textbox
        yield history, ""
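# Note: the prompt above uses a plain "System:/User:/Assistant:" text format rather than the
# model's own chat template. A possible alternative (a sketch, not what this app does):
# llama-cpp-python's create_chat_completion formats messages with the model's chat template
# (read from the GGUF metadata when available) and streams OpenAI-style chunks:
#
#   stream = llm.create_chat_completion(
#       messages=[{"role": "system", "content": "You are a helpful assistant."},
#                 {"role": "user", "content": "Hello"}],
#       stream=True,
#   )
#   for chunk in stream:
#       delta = chunk["choices"][0]["delta"]
#       if "content" in delta:
#           print(delta["content"], end="")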
# Build the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"""# Gradio Chat Demo (CPU Optimized with GGUF)
### Model: `{MODEL_REPO}` | Quantization: `{QUANTIZED_FILENAME}`""")

    chatbot = gr.Chatbot(label="Chat History", height=500, avatar_images=(user_im, bot_im))
    msg = gr.Textbox(
        label="Your Message",
        placeholder="Type your message here and press Enter...",
    )

    with gr.Accordion("Model Parameters", open=False):
        system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful assistant.")
        max_new_tokens = gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max New Tokens")
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)")

    clear = gr.Button("Clear Chat History")

    # Connect the generator function to the textbox submit event
    msg.submit(
        generate_chat_stream,
        [msg, chatbot, system_prompt, max_new_tokens, temperature, top_p],
        [chatbot, msg]
    )
    clear.click(lambda: [], None, chatbot, queue=False)
# Launch the demo; .queue() enables the request queue Gradio needs for streaming generator outputs
demo.queue().launch(debug=True)
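# To run locally (assuming a standard setup): `python app.py`, then open the printed URL
# (Gradio defaults to http://127.0.0.1:7860).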