import os
import gradio as gr
from huggingface_hub import hf_hub_download
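# Install llama-cpp-python at runtime so the import below succeeds;
# on Hugging Face Spaces this dependency is usually pinned in requirements.txt instead.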
os.system("pip install llama-cpp-python")
from llama_cpp import Llama
# --- Configuration and Model Loading ---
MODEL_REPO = "unsloth/gemma-3-270m-it-GGUF"
QUANTIZED_FILENAME = "gemma-3-270m-it-Q4_K_M.gguf"
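# Avatar images for the chat UI, served from another Hugging Face Space.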
bot_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/bot.png"
user_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/user.png"
try:
print(f"Downloading model: {QUANTIZED_FILENAME} from {MODEL_REPO}...")
model_path = hf_hub_download(
repo_id=MODEL_REPO,
filename=QUANTIZED_FILENAME
)
print("Loading GGUF model for CPU inference...")
llm = Llama(
model_path=model_path,
n_ctx=8192,
n_gpu_layers=0, # Ensures CPU-only operation
verbose=False
)
print("Model loaded successfully.")
except Exception as e:
print(f"Error loading model: {e}")
exit()
# --- Gradio UI and Logic ---
# A single generator function that handles the whole chat turn: prompt construction, streaming generation, and UI updates.
def generate_chat_stream(user_message, history, system_prompt, max_new_tokens, temperature, top_p):
"""
A single generator function to handle streaming chat responses.
"""
# Add the user's message to the history, with an empty string as a placeholder for the bot's response.
history.append([user_message, ""])
# Build the prompt for the model
full_prompt = ""
if system_prompt and system_prompt.strip():
full_prompt += f"System: {system_prompt}\n"
# Pass the history *before* the current turn to the model
for user_msg, model_msg in history[:-1]:
full_prompt += f"User: {user_msg}\n"
if model_msg is not None:
full_prompt += f"Assistant: {model_msg}\n"
full_prompt += f"User: {user_message}\nAssistant: "
    # Generate a streaming response from the model
    stream = llm(
        prompt=full_prompt,
        max_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        stream=True
    )
    # Yield the history object at each step to update the UI
    for output in stream:
        # Append the new token to the bot's message placeholder
        history[-1][1] += output['choices'][0]['text']
        # Yield the updated history and clear the user's textbox
        yield history, ""
# Build the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown(f"""# Gradio Chat Demo (CPU Optimized with GGUF)
### Model: `{MODEL_REPO}` | Quantization: `{QUANTIZED_FILENAME}`""")
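    # gr.Chatbot's avatar_images expects a (user avatar, bot avatar) tuple, in that order.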
    chatbot = gr.Chatbot(label="Chat History", height=500, avatar_images=(user_im, bot_im))
    msg = gr.Textbox(
        label="Your Message",
        placeholder="Type your message here and press Enter...",
    )
    with gr.Accordion("Model Parameters", open=False):
        system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful assistant.")
        max_new_tokens = gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max New Tokens")
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)")
    clear = gr.Button("Clear Chat History")
    # Connect the generator function to the textbox submit event
    msg.submit(
        generate_chat_stream,
        [msg, chatbot, system_prompt, max_new_tokens, temperature, top_p],
        [chatbot, msg]
    )
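    # Returning an empty list clears the Chatbot component's history.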
    clear.click(lambda: [], None, chatbot, queue=False)
# Launch the demo
demo.queue().launch(debug=True)