# NOTE: HuggingFace Space file-viewer residue (status text, blob hashes,
# line-number gutter) removed — it was not part of the program.
import huggingface_hub as hf_hub
import time
import openvino_genai as ov_genai
import numpy as np
import gradio as gr
import re
import gc
from openvino_genai import GenerationConfig
#hf_hub.snapshot_download(repo_id="OpenVINO/DeepSeek-R1-Distill-Qwen-1.5B-int4-ov", local_dir="ov", local_dir_use_symlinks=False)
# Download the INT4 OpenVINO export of Llama-3.1-TAIDE into ./ov.
# NOTE(review): local_dir_use_symlinks is deprecated and ignored in recent
# huggingface_hub releases — confirm against the pinned version.
hf_hub.snapshot_download(repo_id="hsuwill000/Llama-3.1-TAIDE-LX-8B-Chat_int4_ov", local_dir="ov", local_dir_use_symlinks=False)
# Initialize the model
device = "CPU"
InUsed_model_name = "ov"
model_path = f"./{InUsed_model_name}" # prepend the directory path
pipe = ov_genai.LLMPipeline(model_path, device)
tokenizer = pipe.get_tokenizer()
tokenizer.set_chat_template(tokenizer.chat_template)
# Stop strings to detect in the final decoded text.
config = GenerationConfig(
stop_strings=set(["<|eot_id|>"]) # must be a set
)
# Streaming callback wired into pipe.generate().
def streamer(piece):
    """Echo each decoded token fragment to stdout as it arrives.

    Returns a status flag telling the pipeline whether to keep generating;
    RUNNING means "do not stop".
    """
    print(piece, end='', flush=True)
    return ov_genai.StreamingStatus.RUNNING
# Inference function
def generate_response(prompt, model_name="ov"):
    """Generate a reply for *prompt* using the global OpenVINO pipeline.

    Args:
        prompt: User prompt text from the UI.
        model_name: Label used only in the log line; defaults to the local
            "ov" model directory. The default fixes a crash: the Gradio
            interface supplies a single input, so the original two-argument
            signature raised TypeError on every submit.

    Returns:
        Tuple of (tokens-per-second string formatted to 2 decimals,
        list of generated text strings).
    """
    # Reuse the pipeline/tokenizer built at import time instead of
    # reloading the multi-GB model on every request, as the original did.
    generated = pipe.generate([prompt], config, streamer)
    tokenpersec = f'{generated.perf_metrics.get_throughput().mean:.2f}'
    print(f"\nModel:{model_name} TPS:{tokenpersec}\n")
    return tokenpersec, generated.texts
# Build the Gradio UI. A second input supplies generate_response's
# model_name argument, which the original interface omitted (the function
# takes two parameters); the title/description now match the loaded
# Llama-3.1-TAIDE model instead of the incorrect "Qwen3".
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=5, label="輸入提示 (Prompt)"),
        gr.Textbox(value="ov", label="模型名稱 (Model name)"),
    ],
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="回應"),
    ],
    title="Llama-3.1-TAIDE Model Inference",
    description="基於 Llama-3.1-TAIDE 的 OpenVINO 推理應用,支援串流輸出。",
)

if __name__ == "__main__":
    demo.launch()