# app.py — Gradio chat demo for Phi-4 ONNX (commit c319001, by Rsnarsna)
import numpy as np
import gradio as gr
import onnxruntime as rt
import onnxruntime_genai as og
# Constrain ONNX Runtime to the Space's 2 CPU cores and disable busy-wait
# spinning so idle threads don't burn the shared vCPUs.
so = rt.SessionOptions()
so.intra_op_num_threads = 2
so.inter_op_num_threads = 1
so.add_session_config_entry("session.intra_op.allow_spinning", "0")
# NOTE(review): `so` is never passed to anything below — og.Model does not
# take an onnxruntime SessionOptions, so these settings appear to have no
# effect. Confirm whether onnxruntime-genai picks up threading config via
# its own genai_config.json / og.Config instead.
# Load the int4 RTN-quantized CPU build of Phi-4 and its tokenizer.
model = og.Model("/phi4_model/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4")
tokenizer = og.Tokenizer(model)
def respond(message, history):
    """Generate one assistant reply with the Phi-4 ONNX model.

    Gradio ChatInterface callback (tuple-style history).

    Args:
        message: The user's latest message as a string.
        history: List of (user, assistant) string pairs from prior turns.

    Returns:
        The decoded model completion for this turn.
    """
    # Rebuild the full conversation in the Phi-4 chat template, then open
    # a fresh assistant turn for the model to complete.
    prompt = "".join(f"<|user|>{u}<|end|><|assistant|>{a}<|end|>" for u, a in history)
    prompt += f"<|user|>{message}<|end|><|assistant|>"

    input_ids = tokenizer.encode(prompt)

    params = og.GeneratorParams(model)
    # encode() returns a 1-D token array; the runtime expects a batch axis.
    params.set_input_sequences(input_ids[np.newaxis, :])
    # Cap generation at 256 new tokens beyond the prompt; greedy decoding.
    params.set_search_option("max_length", input_ids.shape[0] + 256)
    params.set_search_option("do_sample", False)

    output_ids = model.generate(params)
    # Output includes the prompt tokens; slice them off before decoding.
    gen = output_ids[0, input_ids.shape[0]:]
    return tokenizer.decode(gen.astype(np.int32))
# Build the chat UI around the respond() callback and serve it on all
# interfaces at the standard Spaces port.
chat_ui = gr.ChatInterface(respond, title="🧠 Phi‑4 ONNX Chat (2‑Core)")
chat_ui.launch(server_name="0.0.0.0", server_port=7860)