import gc

import gradio as gr

# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM  # , HqqConfig
#
# # quant_config = HqqConfig(nbits=8, group_size=64)
# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# print("Loading tokenizer & model…")
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
#
# # model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
# model = \
#     AutoModelForCausalLM \
#     .from_pretrained(
#         MODEL_ID,
#         torch_dtype=torch.float16,
#         # device_map="cuda",
#         # quantization_config=quant_config
#     ).to(DEVICE)
# gc.collect()

#########

import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig

# quant_config = Float8WeightOnlyConfig()
quant_config = Float8DynamicActivationFloat8WeightConfig()
quantization_config = TorchAoConfig(quant_type=quant_config)

MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=quantization_config,
)
gc.collect()
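# Optional sanity check after loading. This is only a sketch of how one might
# confirm the float8 quantization actually shrank the weights; it assumes the
# standard transformers PreTrainedModel.get_memory_footprint() helper and can
# be removed without affecting the app.
# print(f"Quantized model footprint: {model.get_memory_footprint() / 1e9:.2f} GB")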
#########

# from unsloth import FastLanguageModel
# model, tokenizer = FastLanguageModel.from_pretrained(
#     "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
#     max_seq_length=128_000,
#     load_in_4bit=True
# )

#########

# import gc
# import gradio as gr
# from transformers import AutoTokenizer
# from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
# from optimum.onnxruntime.configuration import AutoQuantizationConfig

# MODEL_NAME = "HuggingFaceTB/SmolLM3-3B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = ORTModelForCausalLM.from_pretrained(MODEL_NAME, export=True)

# print("Creating quant config")
# qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
# print("Creating quant config successful")

# print("Creating quantizer")
# quantizer = ORTQuantizer.from_pretrained(model)
# print("Creating quantizer successful")

# # Step 4: Perform quantization saving output in a new directory
# quantized_model_dir = "./quantized_model"
# print("Starting quantization...")
# quantizer.quantize(save_dir=quantized_model_dir, quantization_config=qconfig)
# print("Quantization was successful. Garbage collecting...")
# del quantizer
# del qconfig
# del model

# Run garbage collection again to release memory from quantizer objects
gc.collect()

# # Step 5: Load the quantized ONNX model for inference
# print("Loading quantized ONNX model for inference...")
# model = ORTModelForCausalLM.from_pretrained(quantized_model_dir)
# print("Loading model was successful. Garbage collecting.")

# Garbage collection again after final loading
gc.collect()

#########

# print("Loading tokenizer & model…")
# import gradio as gr
# from transformers import AutoTokenizer
# from optimum.onnxruntime import ORTModelForCausalLM

# MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# model = ORTModelForCausalLM.from_pretrained(MODEL_ID, export=True, quantize=True)

#########

# -------------------------------------------------
# Optional tool(s)
# -------------------------------------------------
# TOOLS = [{
#     "name": "get_weather",
#     "description": "Get the current weather in a given city",
#     "parameters": {
#         "type": "object",
#         "properties": {
#             "city": {"type": "string", "description": "City name"}
#         },
#         "required": ["city"]
#     }
# }]


# -------------------------------------------------
# Helpers
# -------------------------------------------------
def build_messages(history, enable_thinking: bool):
    """Convert Gradio history to the chat template."""
    messages = []
    for h in history:
        messages.append({"role": h["role"], "content": h["content"]})
    # Add system instruction for mode
    system_flag = "/think" if enable_thinking else "/no_think"
    messages.insert(0, {"role": "system", "content": system_flag})
    return messages


def chat_fn(history, enable_thinking, temperature, top_p, top_k, repetition_penalty, max_new_tokens):
    """Generate a response and stream it back to the chatbot."""
    messages = build_messages(history, enable_thinking)
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        # xml_tools=TOOLS
    )
    # Move the prompt tensors onto the same device as the model
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    gc.collect()

    with torch.inference_mode():
        generated = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.eos_token_id,
            streamer=None  # we'll yield manually
        )
    gc.collect()

    # Drop the prompt tokens and decode only the newly generated ones
    output_ids = generated[0][len(inputs.input_ids[0]):]
    response = tokenizer.decode(output_ids, skip_special_tokens=True)

    # Make the reasoning markers visible in the rendered chat
    if isinstance(response, str):
        response = response.replace('<think>', "# <think>").replace('</think>', "</think>")
    elif isinstance(response, list):
        response = [r.replace('<think>', "# <think>").replace('</think>', "</think>") for r in response]
    else:
        raise ValueError("Tokenizer response seems malformed; not a string, nor a list.")

    # Streaming char-by-char
    history.append({"role": "assistant", "content": ""})
    for ch in response:
        history[-1]["content"] += ch
        yield history


# -------------------------------------------------
# Blocks UI
# -------------------------------------------------
with gr.Blocks(title="SmolLM3-3B Chat") as demo:
    gr.Markdown("## 🤖 SmolLM3-3B Chatbot (Streaming)")

    with gr.Row():
        enable_think = gr.Checkbox(label="Enable Extended Thinking (/think)", value=True)
        temperature = gr.Slider(0.0, 1.0, value=0.6, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
        top_k = gr.Slider(1, 40, value=20, label="Top-k")
        repetition_penalty = gr.Slider(1.0, 1.4, value=1.1, label="Repetition penalty")
        max_new_tokens = gr.Slider(1000, 32768, value=32768, label="Max new tokens")

    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(placeholder="Type your message here…", lines=1)
    clear = gr.Button("Clear")

    def user_fn(user_msg, history):
        return "", history + [{"role": "user", "content": user_msg}]

    msg.submit(
        user_fn, [msg, chatbot], [msg, chatbot], queue=False
    ).then(
        chat_fn,
        [chatbot, enable_think, temperature, top_p, top_k, repetition_penalty, max_new_tokens],
        chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue().launch()
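#########
# Optional: true token-by-token streaming.
# chat_fn above generates the full completion and then replays it
# character-by-character. A hedged sketch of real streaming with transformers'
# TextIteratorStreamer is kept below as a commented-out alternative; the
# function name `chat_fn_streaming` and the kwargs dict are illustrative, not
# part of the original app.
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def chat_fn_streaming(history, enable_thinking, temperature, top_p, top_k,
#                       repetition_penalty, max_new_tokens):
#     messages = build_messages(history, enable_thinking)
#     text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     inputs = tokenizer(text, return_tensors="pt").to(model.device)
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     generation_kwargs = dict(
#         **inputs,
#         streamer=streamer,
#         max_new_tokens=max_new_tokens,
#         do_sample=True,
#         temperature=temperature,
#         top_p=top_p,
#         top_k=top_k,
#         repetition_penalty=repetition_penalty,
#         pad_token_id=tokenizer.eos_token_id,
#     )
#     # generate() runs in a background thread while we drain the streamer
#     Thread(target=model.generate, kwargs=generation_kwargs).start()
#     history.append({"role": "assistant", "content": ""})
#     for new_text in streamer:
#         history[-1]["content"] += new_text
#         yield history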