import gc
import gradio as gr
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM #, HqqConfig
# # # quant_config = HqqConfig(nbits=8, group_size=64)
# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# print("Loading tokenizer & model…")
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# # # model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
# model =\
#     AutoModelForCausalLM\
#     .from_pretrained(
#         MODEL_ID,
#         torch_dtype=torch.float16,
#         # device_map="cuda",
#         # quantization_config=quant_config
#     ).to(DEVICE)
# gc.collect()
#########
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig

# quant_config = Float8WeightOnlyConfig()
quant_config = Float8DynamicActivationFloat8WeightConfig()
quantization_config = TorchAoConfig(quant_type=quant_config)

MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=quantization_config,
)
gc.collect()
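
# Note: the float8 dynamic-activation/weight config above generally assumes a GPU
# with float8 kernel support. For a CPU-only Space, an int8 weight-only config is
# a plausible alternative; a hedged sketch (assuming the installed torchao build
# exposes Int8WeightOnlyConfig) would be:
# from torchao.quantization import Int8WeightOnlyConfig
# quant_config = Int8WeightOnlyConfig()
# quantization_config = TorchAoConfig(quant_type=quant_config)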
#########
# from unsloth import FastLanguageModel
# model, tokenizer = FastLanguageModel.from_pretrained(
#     "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
#     max_seq_length=128_000,
#     load_in_4bit=True
# )
#########
# import gc
# import gradio as gr
# from transformers import AutoTokenizer
# from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
# from optimum.onnxruntime.configuration import AutoQuantizationConfig
# MODEL_NAME = "HuggingFaceTB/SmolLM3-3B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = ORTModelForCausalLM.from_pretrained(MODEL_NAME, export=True)
# print("Creating quant config")
# qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
# print("Creating quant config successful")
# print("Creating quantizer")
# quantizer = ORTQuantizer.from_pretrained(model)
# print("Creating quantizer successful")
# # Step 4: Perform quantization, saving the output in a new directory
# quantized_model_dir = "./quantized_model"
# print("Starting quantization...")
# quantizer.quantize(save_dir=quantized_model_dir, quantization_config=qconfig)
# print("Quantization was successful. Garbage collecting...")
# del quantizer
# del qconfig
# del model
# # Run garbage collection again to release memory held by the quantizer objects
# gc.collect()
# # Step 5: Load the quantized ONNX model for inference
# print("Loading quantized ONNX model for inference...")
# model = ORTModelForCausalLM.from_pretrained(quantized_model_dir)
# print("Loading model was successful. Garbage collecting.")
# # Garbage collection again after the final load
# gc.collect()
#########
# print("Loading tokenizer & model…")
# import gradio as gr
# from transformers import AutoTokenizer
# from optimum.onnxruntime import ORTModelForCausalLM
# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# model = ORTModelForCausalLM.from_pretrained(MODEL_ID, export=True, quantize=True)
#########

# -------------------------------------------------
# Optional tool(s)
# -------------------------------------------------
# TOOLS = [{
#     "name": "get_weather",
#     "description": "Get the current weather in a given city",
#     "parameters": {
#         "type": "object",
#         "properties": {
#             "city": {"type": "string", "description": "City name"}
#         },
#         "required": ["city"]
#     }
# }]
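
# If the tool definition above is enabled, it is meant to be passed to the chat
# template via apply_chat_template(..., xml_tools=TOOLS) as sketched in chat_fn
# below (this assumes the SmolLM3 chat template accepts an xml_tools argument).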

# -------------------------------------------------
# Helpers
# -------------------------------------------------
def build_messages(history, enable_thinking: bool):
    """Convert Gradio chat history into chat-template messages and prepend the thinking-mode flag."""
    messages = []
    for h in history:
        messages.append({"role": h["role"], "content": h["content"]})
    # Prepend a system message that switches SmolLM3 between /think and /no_think modes
    system_flag = "/think" if enable_thinking else "/no_think"
    messages.insert(0, {"role": "system", "content": system_flag})
    return messages
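
# Example of the structure build_messages returns (hypothetical one-turn history):
#   build_messages([{"role": "user", "content": "Hi"}], enable_thinking=True)
#   -> [{"role": "system", "content": "/think"}, {"role": "user", "content": "Hi"}]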

def chat_fn(history, enable_thinking, temperature, top_p, top_k, repetition_penalty, max_new_tokens):
    """Generate a response and stream it back to the chat window."""
    messages = build_messages(history, enable_thinking)
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        # xml_tools=TOOLS
    )
    # Keep the inputs on the same device as the (possibly device-mapped) model
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    gc.collect()

    with torch.inference_mode():
        generated = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.eos_token_id,
            streamer=None  # no token streamer; the full output is decoded and yielded below
        )
    gc.collect()

    # Drop the prompt tokens and decode only the newly generated ones
    output_ids = generated[0][len(inputs.input_ids[0]):]
    response = tokenizer.decode(output_ids, skip_special_tokens=True)

    # Surface the <think>…</think> block as Markdown headings in the chat window
    if isinstance(response, str):
        response = response.replace("<think>", "# <think>").replace("</think>", "# </think>")
    elif isinstance(response, list):
        response = [part.replace("<think>", "# <think>").replace("</think>", "# </think>") for part in response]
    else:
        raise ValueError("Decoded response is neither a string nor a list.")

    # Stream the response character by character
    history.append({"role": "assistant", "content": ""})
    for ch in response:
        history[-1]["content"] += ch
        yield history
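
# -------------------------------------------------
# Alternative: true token streaming (sketch)
# -------------------------------------------------
# chat_fn generates the full completion first and only then yields it character
# by character. A minimal sketch of genuine token-level streaming with
# transformers' TextIteratorStreamer, reusing the same `model`/`tokenizer`
# globals (an alternative, not what this Space currently runs), could look like:
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def stream_chat_fn(history, enable_thinking, temperature, top_p, top_k,
#                    repetition_penalty, max_new_tokens):
#     messages = build_messages(history, enable_thinking)
#     text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     inputs = tokenizer(text, return_tensors="pt").to(model.device)
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     gen_kwargs = dict(**inputs, max_new_tokens=max_new_tokens, do_sample=True,
#                       temperature=temperature, top_p=top_p, top_k=top_k,
#                       repetition_penalty=repetition_penalty, streamer=streamer)
#     Thread(target=model.generate, kwargs=gen_kwargs).start()
#     history.append({"role": "assistant", "content": ""})
#     for new_text in streamer:  # yields decoded text as tokens arrive
#         history[-1]["content"] += new_text
#         yield history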

# -------------------------------------------------
# Blocks UI
# -------------------------------------------------
with gr.Blocks(title="SmolLM3-3B Chat") as demo:
    gr.Markdown("## 🤖 SmolLM3-3B Chatbot (Streaming)")

    with gr.Row():
        enable_think = gr.Checkbox(label="Enable Extended Thinking (/think)", value=True)
        temperature = gr.Slider(0.0, 1.0, value=0.6, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
        top_k = gr.Slider(1, 40, value=20, label="Top-k")
        repetition_penalty = gr.Slider(1.0, 1.4, value=1.1, label="Repetition penalty")
        max_new_tokens = gr.Slider(1000, 32768, value=32768, label="Max new tokens")

    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(placeholder="Type your message here…", lines=1)
    clear = gr.Button("Clear")

    def user_fn(user_msg, history):
        return "", history + [{"role": "user", "content": user_msg}]

    msg.submit(
        user_fn, [msg, chatbot], [msg, chatbot], queue=False
    ).then(
        chat_fn, [chatbot, enable_think, temperature, top_p, top_k, repetition_penalty, max_new_tokens], chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue().launch()
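
# Assumed local-run note (inferred from the imports above, not from the Space's
# own files): the dependencies are roughly gradio, torch, transformers, and
# torchao; with those installed, the script can be started directly, e.g.
# `python app.py` if this file is the Space's app.py.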