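# Gradio chat demo that streams responses from a quantized (GGUF) Gemma 3 270M
# model running on CPU via llama-cpp-python.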
import os

import gradio as gr
from huggingface_hub import hf_hub_download

# Install llama-cpp-python at runtime (before it is imported) so the app also
# works in environments where the package is not preinstalled.
os.system("pip install llama-cpp-python")
from llama_cpp import Llama

# GGUF repository and quantized weights file on the Hugging Face Hub.
MODEL_REPO = "unsloth/gemma-3-270m-it-GGUF"
QUANTIZED_FILENAME = "gemma-3-270m-it-Q4_K_M.gguf"

# Avatar images for the chat window (user and assistant).
bot_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/bot.png"
user_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/user.png"
|
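# Download the quantized weights from the Hub and load them for CPU-only
# inference; stop the app if either step fails.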
try:
    print(f"Downloading model: {QUANTIZED_FILENAME} from {MODEL_REPO}...")
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=QUANTIZED_FILENAME
    )

    print("Loading GGUF model for CPU inference...")
    llm = Llama(
        model_path=model_path,
        n_ctx=8192,
        n_gpu_layers=0,
        verbose=False
    )
    print("Model loaded successfully.")

except Exception as e:
    print(f"Error loading model: {e}")
    exit()

|
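# Note: the prompt below is built as plain "System / User / Assistant" text rather
# than Gemma's native chat template; this keeps the example simple, but it may not
# match the formatting the instruction-tuned model was trained on.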
def generate_chat_stream(user_message, history, system_prompt, max_new_tokens, temperature, top_p):
    """
    A single generator function to handle streaming chat responses.
    """
    # Add the new user turn with an empty assistant reply to be filled in.
    history.append([user_message, ""])

    # Rebuild the full conversation as a plain-text prompt.
    full_prompt = ""
    if system_prompt and system_prompt.strip():
        full_prompt += f"System: {system_prompt}\n"

    # Previous turns (everything except the message just appended).
    for user_msg, model_msg in history[:-1]:
        full_prompt += f"User: {user_msg}\n"
        if model_msg is not None:
            full_prompt += f"Assistant: {model_msg}\n"

    full_prompt += f"User: {user_message}\nAssistant: "

    # Stream the completion; stop if the model starts writing the next user turn.
    stream = llm(
        prompt=full_prompt,
        max_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        stop=["\nUser:"],
        stream=True
    )

    # Append each new chunk to the last assistant message and push the update to
    # the UI; the second output clears the textbox.
    for output in stream:
        history[-1][1] += output['choices'][0]['text']
        yield history, ""

|
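# Build the Gradio UI: chat window, message box, sampling controls, and a clear button.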
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"""# Gradio Chat Demo (CPU Optimized with GGUF)
### Model: `{MODEL_REPO}` | Quantization: `{QUANTIZED_FILENAME}`""")

    chatbot = gr.Chatbot(label="Chat History", height=500, avatar_images=(user_im, bot_im))

    msg = gr.Textbox(
        label="Your Message",
        placeholder="Type your message here and press Enter...",
    )

    with gr.Accordion("Model Parameters", open=False):
        system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful assistant.")
        max_new_tokens = gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max New Tokens")
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)")

    clear = gr.Button("Clear Chat History")

    # Submitting the textbox streams the reply into the chatbot and clears the box.
    msg.submit(
        generate_chat_stream,
        [msg, chatbot, system_prompt, max_new_tokens, temperature, top_p],
        [chatbot, msg]
    )

    # Reset the conversation.
    clear.click(lambda: [], None, chatbot, queue=False)


demo.queue().launch(debug=True)