import os
import gradio as gr
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
# --- Configuration and Model Loading ---
# Set your Hugging Face token (useful for Spaces)
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    try:
        login(token=HF_TOKEN)
        print("Successfully logged in to Hugging Face Hub.")
    except Exception as e:
        print(f"Error logging in to Hugging Face Hub: {e}")
else:
    print("HF_TOKEN not set, proceeding without login.")
MODEL_ID = "google/gemma-3-270m-it"
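# Avatar images for the chat UI (user and bot icons served from the Hugging Face Hub)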
bot_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/bot.png"
user_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/user.png"
try:
    print(f"Loading tokenizer: {MODEL_ID}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    print(f"Loading model: {MODEL_ID} for CPU...")
    # Load the model with the default float32 precision
    # (no device_map or specific dtype is needed for CPU)
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

    # --- CPU OPTIMIZATION ---
    print("Optimizing model for CPU inference with quantization...")
    # 1. Set the model to evaluation mode
    model.eval()
    # 2. Apply dynamic quantization to the linear layers
    #    This converts float32 weights to int8, making it much faster on CPU
    model_quantized = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )
    print("Model successfully quantized and optimized for CPU.")
except Exception as e:
    print(f"Error loading or quantizing model: {e}")
    exit()
# --- Gradio UI and Logic ---
def generate_response(message, history, system_prompt, max_new_tokens, temperature, top_p):
    """
    Generates a streaming response from the CPU-quantized model.
    """
    conversation = []
    if system_prompt and system_prompt.strip():
        conversation.append({"role": "system", "content": system_prompt})
    for user_msg, model_msg in history:
        conversation.append({"role": "user", "content": user_msg})
        if model_msg is not None:
            conversation.append({"role": "assistant", "content": model_msg})
    conversation.append({"role": "user", "content": message})

    inputs = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )  # Inputs will be on CPU by default

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "temperature": float(temperature),
        "top_p": float(top_p),
        "do_sample": True,
    }

    # Wrapper function to run generation within a no_grad context
    def generation_thread_target(**kwargs):
        # Use torch.no_grad() for inference to save memory and computation
        with torch.no_grad():
            model_quantized.generate(**kwargs)

    # Run generation in a separate thread so tokens can be streamed as they arrive
    thread = Thread(target=generation_thread_target, kwargs=generation_kwargs)
    thread.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response
# Build the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"""# Gradio Chat Demo (CPU Optimized) with {MODEL_ID}
    Duplicate this Space to run it on a private CPU or GPU for faster inference.""")
    chatbot = gr.Chatbot(label="Chat History", height=500, avatar_images=(user_im, bot_im))
    msg = gr.Textbox(
        label="Your Message",
        placeholder="Type your message here and press Enter...",
    )
    with gr.Accordion("Model Parameters", open=False):
        system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful assistant.")
        max_new_tokens = gr.Slider(minimum=1, maximum=32000, value=2048, step=1, label="Max New Tokens")
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)")
    clear = gr.Button("Clear Chat History")

    def user_and_generate(user_message, history, system_prompt, max_new_tokens, temperature, top_p):
        # Append the new user turn, then stream the assistant's reply into it.
        # Pass history[:-1] so the just-added, still-empty turn is not fed back to the model.
        history.append([user_message, ""])
        stream = generate_response(user_message, history[:-1], system_prompt, max_new_tokens, temperature, top_p)
        for new_text in stream:
            history[-1][1] = new_text
            yield history, ""

    msg.submit(
        user_and_generate,
        [msg, chatbot, system_prompt, max_new_tokens, temperature, top_p],
        [chatbot, msg],
    )
    clear.click(lambda: [], None, chatbot, queue=False)

# Launch the demo
demo.queue().launch(debug=True, share=True)