# qwen3_test / app.py
import huggingface_hub as hf_hub
import openvino_genai as ov_genai
import gradio as gr
from openvino_genai import GenerationConfig
#hf_hub.snapshot_download(repo_id="OpenVINO/DeepSeek-R1-Distill-Qwen-1.5B-int4-ov", local_dir="ov", local_dir_use_symlinks=False)
# Download the INT4 OpenVINO weights into ./ov (local_dir_use_symlinks is deprecated in recent huggingface_hub and is ignored there).
hf_hub.snapshot_download(repo_id="hsuwill000/Llama-3.1-TAIDE-LX-8B-Chat_int4_ov", local_dir="ov", local_dir_use_symlinks=False)
# Initialize the model
device = "CPU"
InUsed_model_name = "ov"
model_path = f"./{InUsed_model_name}"  # prepend the local directory path
pipe = ov_genai.LLMPipeline(model_path, device)
tokenizer = pipe.get_tokenizer()
tokenizer.set_chat_template(tokenizer.chat_template)  # re-apply the model's own chat template
# Define the stop string to detect (note: this is the text as it appears in the final decoded output).
# openvino_genai requires stop_strings to be a set.
config = GenerationConfig(
    stop_strings={"<|eot_id|>"}
)
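# A hedged sketch (not used by this app): GenerationConfig also accepts common
# length/sampling knobs if the defaults prove too loose, e.g.
#   config = GenerationConfig(
#       max_new_tokens=512,            # cap the response length
#       stop_strings={"<|eot_id|>"},   # literal strings that end generation
#   )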
# Streamer callback: echo each decoded subword to stdout as it is generated.
def streamer(subword):
    print(subword, end='', flush=True)
    # The returned status tells the pipeline whether generation should continue or stop.
    return ov_genai.StreamingStatus.RUNNING
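# Hedged alternative (an assumption, not wired into the app): a streamer that
# buffers subwords so the streamed text can be read back after generation.
#   chunks = []
#   def collecting_streamer(subword):
#       chunks.append(subword)
#       return ov_genai.StreamingStatus.RUNNING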
# Inference function
def generate_response(prompt):
    # Reuse the globally initialized pipeline instead of reloading it on every request.
    generated = pipe.generate([prompt], config, streamer)
    tokenpersec = f'{generated.perf_metrics.get_throughput().mean:.2f}'
    print(f"\nModel: {InUsed_model_name} TPS: {tokenpersec}\n")
    return tokenpersec, generated.texts[0]
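# Hedged sketch of a multi-turn variant, assuming openvino_genai's chat API:
# start_chat() makes the pipeline apply the chat template and keep the
# conversation state between calls; finish_chat() resets it.
#   def chat_once(user_message):
#       pipe.start_chat()
#       answer = pipe.generate([user_message], config, streamer)
#       pipe.finish_chat()
#       return answer.texts[0]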
# Build the Gradio interface
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=5, label="Prompt")
    ],
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="Response"),
    ],
    title="Qwen3 Model Inference",
    description="An inference app based on Qwen3, supporting thinking-process separation and a GUI."
)
if __name__ == "__main__":
demo.launch()