File size: 1,879 Bytes
1af7192
c786907
72bde98
 
 
 
 
d6f0fdc
354950f
72bde98
6ad97d2
 
c786907
82fc211
72bde98
058ff15
82fc211
 
208aae9
b18b8b5
208aae9
354950f
 
 
 
c786907
9039202
39aecff
 
9039202
 
 
39aecff
82fc211
72bde98
058ff15
 
 
 
 
 
 
e177707
72bde98
 
8886195
72bde98
 
 
 
058ff15
72bde98
 
 
ac387f0
72bde98
 
1af7192
72bde98
50398e9
8e9ef4f
1af7192
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63

import huggingface_hub as hf_hub
import time
import openvino_genai as ov_genai
import numpy as np
import gradio as gr
import re
import gc
from openvino_genai import GenerationConfig

# Alternative model (kept for easy switching): DeepSeek-R1 distill, int4 OpenVINO export.
#hf_hub.snapshot_download(repo_id="OpenVINO/DeepSeek-R1-Distill-Qwen-1.5B-int4-ov", local_dir="ov", local_dir_use_symlinks=False)
# Download the int4 OpenVINO export of Llama-3.1-TAIDE into ./ov (no-op if already cached).
# NOTE(review): `local_dir_use_symlinks` is deprecated in recent huggingface_hub — confirm version.
hf_hub.snapshot_download(repo_id="hsuwill000/Llama-3.1-TAIDE-LX-8B-Chat_int4_ov", local_dir="ov", local_dir_use_symlinks=False)

# Initialize the model pipeline.
device = "CPU"
InUsed_model_name = "ov"
model_path = f"./{InUsed_model_name}"  # prepend directory path
pipe = ov_genai.LLMPipeline(model_path, device)

tokenizer = pipe.get_tokenizer()
# Re-apply the model's own chat template so generate() formats prompts as chat turns.
tokenizer.set_chat_template(tokenizer.chat_template)
# Stop string to detect in the *decoded* output text (Llama-3.1 end-of-turn marker).
config = GenerationConfig(
    stop_strings=set(["<|eot_id|>"])  # GenerationConfig requires a set here
)

# Token streamer: echoes generated text to stdout as it arrives.
def streamer(token_text):
    """Print each decoded subword immediately, then tell the pipeline to keep going.

    The return value is the control flag the pipeline checks after every
    token; RUNNING means generation should continue.
    """
    print(token_text, end='', flush=True)
    return ov_genai.StreamingStatus.RUNNING

    
# Inference function (called by the Gradio interface).
def generate_response(prompt, model_name="ov"):
    """Generate a completion for `prompt` with the module-level pipeline.

    Parameters:
        prompt:     user text to send to the model.
        model_name: label used only for the TPS log line; defaults to "ov".
                    The default is required because the Gradio interface wires
                    only the prompt textbox to this function — without it,
                    every call raised TypeError (missing positional argument).

    Returns:
        (tokens_per_second_str, generated_texts) — throughput formatted to
        two decimals, and the list of decoded completions.
    """
    # Fixed defect: the original rebuilt LLMPipeline("ov", device) on every
    # request, reloading the multi-GB model per prompt (and ignoring
    # model_name anyway). Reuse the pipeline created at module load instead.
    generated = pipe.generate([prompt], config, streamer)
    tokenpersec = f'{generated.perf_metrics.get_throughput().mean:.2f}'
    print(f"\nModel:{model_name} TPS:{tokenpersec}\n")

    return tokenpersec, generated.texts

# Build the Gradio interface.
# Fixed defect: the title/description claimed "Qwen3", but the app actually
# loads hsuwill000/Llama-3.1-TAIDE-LX-8B-Chat_int4_ov — corrected the labels
# so the UI matches the model being served.
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=5, label="輸入提示 (Prompt)")
    ],
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="回應"),
    ],
    title="Llama-3.1-TAIDE Model Inference",
    description="基於 Llama-3.1-TAIDE 推理應用,支援思考過程分離與 GUI。"
)

# Launch the web UI only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()