import gradio as gr
import onnxruntime as rt
import onnxruntime_genai as og

# NOTE: og.Model() never consumes an rt.SessionOptions object, so the settings
# below only affect sessions created directly through onnxruntime. For
# onnxruntime-genai, thread counts are read from the model folder's
# genai_config.json (see the sketch at the end of this file).
so = rt.SessionOptions()
so.intra_op_num_threads = 2   # cap compute threads at the two available cores
so.inter_op_num_threads = 1   # no cross-op parallelism on a 2-core box
so.add_session_config_entry("session.intra_op.allow_spinning", "0")  # no busy-waiting

# Load the int4 RTN-quantized Phi-4 model and its tokenizer.
model = og.Model("/phi4_model/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4")
tokenizer = og.Tokenizer(model)


def respond(message, history):
    # Rebuild the conversation as a Phi-style prompt from Gradio's
    # (user, assistant) history tuples.
    prompt = "".join(f"<|user|>{u}<|end|><|assistant|>{a}<|end|>" for u, a in history)
    prompt += f"<|user|>{message}<|end|><|assistant|>"

    input_ids = tokenizer.encode(prompt)

    params = og.GeneratorParams(model)
    params.set_search_options(max_length=len(input_ids) + 256, do_sample=False)

    # Greedy decode via the Generator API (onnxruntime-genai >= 0.5).
    generator = og.Generator(model, params)
    generator.append_tokens(input_ids)
    while not generator.is_done():
        generator.generate_next_token()

    # Drop the prompt tokens and decode only what the model generated.
    output_ids = generator.get_sequence(0)
    return tokenizer.decode(output_ids[len(input_ids):])


gr.ChatInterface(respond, title="🧠 Phi-4 ONNX Chat (2-Core)").launch(
    server_name="0.0.0.0", server_port=7860
)
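# ---------------------------------------------------------------------------
# Pinning onnxruntime-genai itself to two cores: og.Model reads its session
# settings from genai_config.json inside the model folder, not from the
# rt.SessionOptions above. A minimal sketch of the relevant section, assuming
# the standard onnxruntime-genai config layout (edit the genai_config.json
# shipped in the model folder before starting the app):
#
#   {
#     "model": {
#       "decoder": {
#         "session_options": {
#           "intra_op_num_threads": 2,
#           "inter_op_num_threads": 1
#         }
#       }
#     }
#   }
# ---------------------------------------------------------------------------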