import os

import gradio as gr
from huggingface_hub import hf_hub_download

# llama-cpp-python is installed at startup so the Space needs no custom build
# step; for local runs, install it ahead of time instead.
os.system("pip install llama-cpp-python")
from llama_cpp import Llama

# --- Configuration and Model Loading ---
MODEL_REPO = "unsloth/gemma-3-270m-it-GGUF"
QUANTIZED_FILENAME = "gemma-3-270m-it-Q4_K_M.gguf"

bot_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/bot.png"
user_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/user.png"

try:
    print(f"Downloading model: {QUANTIZED_FILENAME} from {MODEL_REPO}...")
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename=QUANTIZED_FILENAME)

    print("Loading GGUF model for CPU inference...")
    llm = Llama(
        model_path=model_path,
        n_ctx=8192,
        n_gpu_layers=0,  # Ensures CPU-only operation
        verbose=False,
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    raise SystemExit(1)

# --- Gradio UI and Logic ---

def generate_chat_stream(user_message, history, system_prompt, max_new_tokens, temperature, top_p):
    """A single generator function to handle streaming chat responses."""
    # Add the user's message to the history, with an empty string as a
    # placeholder for the bot's response.
    history.append([user_message, ""])

    # Build a plain-text prompt. Note that this is a simple "User:/Assistant:"
    # transcript, not the chat template embedded in the GGUF file.
    full_prompt = ""
    if system_prompt and system_prompt.strip():
        full_prompt += f"System: {system_prompt}\n"

    # Pass the history *before* the current turn to the model.
    for user_msg, model_msg in history[:-1]:
        full_prompt += f"User: {user_msg}\n"
        if model_msg:
            full_prompt += f"Assistant: {model_msg}\n"
    full_prompt += f"User: {user_message}\nAssistant: "

    # Generate a streaming response from the model.
    stream = llm(
        prompt=full_prompt,
        max_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        stop=["User:"],  # Keep the model from writing the next user turn itself
        stream=True,
    )

    # Yield the updated history at each step so the UI streams the reply,
    # plus an empty string to clear the user's textbox.
    for output in stream:
        history[-1][1] += output["choices"][0]["text"]
        yield history, ""


# Build the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        f"""# Gradio Chat Demo (CPU Optimized with GGUF)
### Model: `{MODEL_REPO}` | Quantization: `{QUANTIZED_FILENAME}`"""
    )

    chatbot = gr.Chatbot(label="Chat History", height=500, avatar_images=(user_im, bot_im))
    msg = gr.Textbox(
        label="Your Message",
        placeholder="Type your message here and press Enter...",
    )

    with gr.Accordion("Model Parameters", open=False):
        system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful assistant.")
        max_new_tokens = gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max New Tokens")
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)")

    clear = gr.Button("Clear Chat History")

    # Connect the generator function to the submit event.
    msg.submit(
        generate_chat_stream,
        [msg, chatbot, system_prompt, max_new_tokens, temperature, top_p],
        [chatbot, msg],
    )
    clear.click(lambda: [], None, chatbot, queue=False)

# Launch the demo (queue() is required for generator/streaming outputs)
demo.queue().launch(debug=True)
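# --- Note: prompt format (optional alternative, not wired into the UI) ---
# The hand-rolled "User:/Assistant:" transcript built in generate_chat_stream()
# does not match the Gemma chat template shipped inside the GGUF file.
# llama-cpp-python can apply that embedded template via create_chat_completion().
# A minimal commented sketch; the message contents and sampling values here are
# illustrative assumptions, not part of the demo above:
#
#   stream = llm.create_chat_completion(
#       messages=[
#           {"role": "system", "content": "You are a helpful assistant."},
#           {"role": "user", "content": "Hello!"},
#       ],
#       max_tokens=256,
#       temperature=0.7,
#       stream=True,
#   )
#   for chunk in stream:
#       delta = chunk["choices"][0].get("delta", {})
#       print(delta.get("content", ""), end="", flush=True)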