# Repository page residue captured together with the file (not Python code):
#   david-thrower's picture
#   Update app.py
#   4402f0f verified
import gc
import gradio as gr
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM #, HqqConfig
# # # quant_config = HqqConfig(nbits=8, group_size=64)
# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# print("Loading tokenizer & model…")
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# # # model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
# model =\
# AutoModelForCausalLM\
# .from_pretrained(
# MODEL_ID,
# torch_dtype=torch.float16,
# # device_map="cuda",
# # quantization_config=quant_config
# ).to(DEVICE)
# gc.collect()
#########
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig
# -------------------------------------------------
# Model + tokenizer (torchao float8 quantization)
# -------------------------------------------------
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"

# Active scheme: float8 dynamic activations + float8 weights.
# Alternative kept for reference (weight-only float8):
# quant_config = Float8WeightOnlyConfig()
quant_config = Float8DynamicActivationFloat8WeightConfig()
quantization_config = TorchAoConfig(quant_type=quant_config)

# The tokenizer is loaded first, then the quantized model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype="auto",
)

# Collect garbage once loading has finished.
gc.collect()
#########
# from unsloth import FastLanguageModel
# model, tokenizer = FastLanguageModel.from_pretrained(
# "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
# max_seq_length=128_000,
# load_in_4bit=True
# )
#########
# import gc
# import gradio as gr
# from transformers import AutoTokenizer
# from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
# from optimum.onnxruntime.configuration import AutoQuantizationConfig
# MODEL_NAME = "HuggingFaceTB/SmolLM3-3B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = ORTModelForCausalLM.from_pretrained(MODEL_NAME, export=True)
# print("Creating quant config")
# qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
# print("Creating quant config successful")
# print("Creating quantizer")
# quantizer = ORTQuantizer.from_pretrained(model)
# print("Creating quantizer successful")
# # Step 4: Perform quantization saving output in a new directory
# quantized_model_dir = "./quantized_model"
# print("Starting quantization...")
# quantizer.quantize(save_dir=quantized_model_dir, quantization_config=qconfig)
# print("Quantization was successful. Garbage collecting...")
# del(quantizer)
# del(qconfig)
# del(model)
# Run garbage collection again to release memory from quantizer objects
# NOTE(review): the quantizer code this comment refers to is commented out
# above, yet this call is live and runs unconditionally at import time.
gc.collect()
# # Step 5: Load the quantized ONNX model for inference
# print("Loading quantized ONNX model for inference...")
# model = ORTModelForCausalLM.from_pretrained(quantized_model_dir)
# print("Loading model was succcessful. Garbage collecting.")
# Garbage collection again after final loading
# NOTE(review): the ONNX load right above is commented out; this call is
# likewise live and runs unconditionally at import time.
gc.collect()
#########
# print("Loading tokenizer & model…")
# import gradio as gr
# from transformers import AutoTokenizer
# from optimum.onnxruntime import ORTModelForCausalLM
# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# model = ORTModelForCausalLM.from_pretrained(MODEL_ID, export=True, quantize=True)
#########
# -------------------------------------------------
# Optional tool(s)
# -------------------------------------------------
# TOOLS = [{
# "name": "get_weather",
# "description": "Get the current weather in a given city",
# "parameters": {
# "type": "object",
# "properties": {
# "city": {"type": "string", "description": "City name"}
# },
# "required": ["city"]
# }
# }]
# -------------------------------------------------
# Helpers
# -------------------------------------------------
def build_messages(history, enable_thinking: bool):
    """Turn the Gradio chat history into a message list for the chat template.

    Only the ``"role"`` and ``"content"`` keys of each history entry are
    copied. A system message carrying the mode flag (``/think`` when
    ``enable_thinking`` is true, ``/no_think`` otherwise) is placed first.

    Args:
        history: Iterable of dicts, each with ``"role"`` and ``"content"``.
        enable_thinking: Selects which mode flag the system message carries.

    Returns:
        A new list: the system message followed by the copied history entries.
    """
    # Copy just the two keys that are needed from every turn.
    turns = [
        {"role": entry["role"], "content": entry["content"]}
        for entry in history
    ]
    # The mode flag is sent as the system instruction.
    mode_flag = "/no_think" if not enable_thinking else "/think"
    system_message = {"role": "system", "content": mode_flag}
    return [system_message, *turns]
def chat_fn(history, enable_thinking, temperature, top_p, top_k, repetition_penalty, max_new_tokens):
    """Generate the assistant reply and replay it into the chat history.

    The complete reply is generated first; it is then appended to ``history``
    one character at a time, yielding after each character so the UI shows a
    typing effect.

    Args:
        history: Chat messages as dicts with ``"role"`` and ``"content"``
            keys. Mutated in place: an assistant message is appended.
        enable_thinking: Selects the ``/think`` or ``/no_think`` system flag.
        temperature: Sampling temperature. A value of ``0`` or less switches
            to greedy decoding instead of sampling.
        top_p: Nucleus-sampling probability mass (used only when sampling).
        top_k: Top-k cutoff (used only when sampling); coerced to an int >= 1.
        repetition_penalty: Repetition penalty passed to ``generate``.
        max_new_tokens: Upper bound on generated tokens; coerced to an int >= 1.

    Yields:
        The updated ``history`` list after each appended character.

    Raises:
        ValueError: If the decoded response is neither a string nor a list.
    """
    messages = build_messages(history, enable_thinking)
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        # xml_tools=TOOLS
    )
    # The model is loaded with device_map="auto", so it may not live on the
    # CPU; the prompt tensors must be on the same device as the model.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # UI sliders can deliver floats; token counts and top-k must be integers.
    generation_kwargs = {
        "max_new_tokens": max(1, int(max_new_tokens)),
        "repetition_penalty": repetition_penalty,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=max(1, int(top_k)),
        )
    else:
        # Sampling needs a strictly positive temperature; the UI allows 0.0,
        # which is treated as a request for deterministic (greedy) decoding.
        generation_kwargs["do_sample"] = False

    gc.collect()
    with torch.inference_mode():
        generated = model.generate(**inputs, **generation_kwargs)
    gc.collect()

    # Drop the prompt tokens so that only the newly generated part is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    output_ids = generated[0][prompt_length:]
    response = tokenizer.decode(output_ids, skip_special_tokens=True)

    # Escape the thinking tags so they are displayed instead of being
    # interpreted as markup by the chat widget.
    if isinstance(response, str):
        response = response.replace('<think>', "# &lt;think&gt;").replace('</think>', "&lt;/think&gt;")
    elif isinstance(response, list):
        response = [
            part.replace('<think>', "# &lt;think&gt;").replace('</think>', "&lt;/think&gt;")
            for part in response
        ]
    else:
        raise ValueError("Tokenizer response seems malformed; Not a string, nor a list?!?!")

    # Replay the finished reply piece by piece (simulated streaming).
    history.append({"role": "assistant", "content": ""})
    for ch in response:
        history[-1]["content"] += ch
        yield history
# -------------------------------------------------
# Blocks UI
# -------------------------------------------------
# Chat UI: generation controls, chat window, input box, and event wiring.
# NOTE(review): the original nesting was lost with the file's indentation;
# all generation controls are assumed to share one row — confirm the layout.
with gr.Blocks(title="SmolLM3-3B Chat") as demo:
    gr.Markdown("## 🤖 SmolLM3-3B Chatbot (Streaming)")
    with gr.Row():
        enable_think = gr.Checkbox(label="Enable Extended Thinking (/think)", value=True)
        temperature = gr.Slider(0.0, 1.0, value=0.6, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
        # Integer-valued controls get an explicit step of 1: without one the
        # slider picks a step from its range, which can be fractional.
        top_k = gr.Slider(1, 40, value=20, step=1, label="Top_k")
        repetition_penalty = gr.Slider(1.0, 1.4, value=1.1, label="Repetition_Penalty")
        max_new_tokens = gr.Slider(1000, 32768, value=32768, step=1, label="Max_New_Tokens")
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(placeholder="Type your message here…", lines=1)
    clear = gr.Button("Clear")

    def user_fn(user_msg, history):
        """Clear the textbox and append the user's message to the history.

        Returns a pair: the new textbox value (empty string) and a new
        history list ending with the user's message. A missing history
        (``None``) is treated as an empty conversation.
        """
        return "", (history or []) + [{"role": "user", "content": user_msg}]

    # First record the user's turn (unqueued), then run the generator that
    # fills in the assistant's reply.
    msg.submit(
        user_fn, [msg, chatbot], [msg, chatbot], queue=False
    ).then(
        chat_fn,
        [chatbot, enable_think, temperature, top_p, top_k, repetition_penalty, max_new_tokens],
        chatbot,
    )
    # Reset the conversation.
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue().launch()