import os

# llama-cpp-python is installed at runtime (a common workaround on Hugging Face Spaces
# when it is not listed in requirements.txt); the install must happen before the import.
os.system("pip install llama-cpp-python")

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# --- Configuration and Model Loading ---
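# gemma-3-270m-it is a small instruction-tuned Gemma 3 model; the Q4_K_M file is a
# 4-bit GGUF quantization, compact enough for CPU-only inference.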
MODEL_REPO = "unsloth/gemma-3-270m-it-GGUF"
QUANTIZED_FILENAME = "gemma-3-270m-it-Q4_K_M.gguf"

# Avatar images shown for the user and the bot in the Chatbot component
bot_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/bot.png"
user_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/user.png"

try:
    print(f"Downloading model: {QUANTIZED_FILENAME} from {MODEL_REPO}...")
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=QUANTIZED_FILENAME
    )

    print("Loading GGUF model for CPU inference...")
    llm = Llama(
        model_path=model_path,
        n_ctx=8192,
        n_gpu_layers=0, # Ensures CPU-only operation
        verbose=False
    )
    print("Model loaded successfully.")

except Exception as e:
    print(f"Error loading model: {e}")
    raise SystemExit(1)  # the app cannot run without the model, so abort startup

# --- Gradio UI and Logic ---

# Generator that handles one full chat turn: it builds the prompt from the running
# history, streams the model's reply token by token, and updates the UI as it goes.
def generate_chat_stream(user_message, history, system_prompt, max_new_tokens, temperature, top_p):
    """Stream the model's response to the latest user message."""
    # Add the user's message to the history, with an empty string as a placeholder for the bot's response.
    history.append([user_message, ""])

    # Build the prompt for the model
    full_prompt = ""
    if system_prompt and system_prompt.strip():
        full_prompt += f"System: {system_prompt}\n"

    # Pass the history *before* the current turn to the model
    for user_msg, model_msg in history[:-1]:
        full_prompt += f"User: {user_msg}\n"
        if model_msg is not None:
            full_prompt += f"Assistant: {model_msg}\n"
    
    full_prompt += f"User: {user_message}\nAssistant: "
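
    # For reference, the assembled prompt has this shape:
    #   System: You are a helpful assistant.
    #   User: <previous message>
    #   Assistant: <previous reply>
    #   User: <current message>
    #   Assistant: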

    # Generate a streaming response from the model. "User:" and "System:" are used as
    # stop sequences so the model does not continue the transcript with invented turns.
    stream = llm(
        prompt=full_prompt,
        max_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        stop=["User:", "System:"],
        stream=True
    )

    # Yield the history object at each step to update the UI
    for output in stream:
        # Append the new token to the bot's message placeholder
        history[-1][1] += output['choices'][0]['text']
        # Yield the updated history and clear the user's textbox
        yield history, ""

# Build the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        f"# Gradio Chat Demo (CPU Optimized with GGUF)\n"
        f"### Model: `{MODEL_REPO}` | Quantization: `{QUANTIZED_FILENAME}`"
    )
    
    chatbot = gr.Chatbot(label="Chat History", height=500, avatar_images=(user_im, bot_im))
    
    msg = gr.Textbox(
        label="Your Message",
        placeholder="Type your message here and press Enter...",
    )
        
    with gr.Accordion("Model Parameters", open=False):
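        # Lower temperature and top-p make replies more deterministic; higher values add variety.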
        system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful assistant.")
        max_new_tokens = gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max New Tokens")
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)")

    clear = gr.Button("Clear Chat History")

    # Connect the generator function to the textbox's submit event
    msg.submit(
        generate_chat_stream,
        [msg, chatbot, system_prompt, max_new_tokens, temperature, top_p],
        [chatbot, msg]
    )

    clear.click(lambda: [], None, chatbot, queue=False)

# Launch the demo
demo.queue().launch(debug=True)
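
# On Hugging Face Spaces this file is typically saved as app.py and runs automatically;
# locally, `python app.py` starts the server (Gradio defaults to http://127.0.0.1:7860).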