import gradio as gr
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning, message="__array__ implementation doesn't accept a copy keyword")
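# The filter above silences a NumPy 2.x DeprecationWarning that older builds of
# faiss / sentence-transformers can trigger; it has no effect on behavior.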

# Load the OpenVINO language model
model_id = "hsuwill000/DeepSeek-R1-Distill-Qwen-1.5B-openvino"
print("Loading model...")
model = OVModelForCausalLM.from_pretrained(model_id, device_map="auto")
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# Load the sentence-embedding model (used to convert text to vectors)
encoder = SentenceTransformer("all-MiniLM-L6-v2")
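# all-MiniLM-L6-v2 maps each sentence to a 384-dimensional embedding; the
# dimensionality is read back from the encoded vectors below, so another
# SentenceTransformer model can be swapped in without further changes.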

# FAQ knowledge base (question + answer pairs)
faq_data = [
    ("What is FAISS?", "FAISS is a library for efficient similarity search and clustering of dense vectors."),
    ("How does FAISS work?", "FAISS uses indexing structures to quickly retrieve the nearest neighbors of a query vector."),
    ("Can FAISS run on GPU?", "Yes, FAISS supports GPU acceleration for faster computation."),
    ("What is OpenVINO?", "OpenVINO is an inference engine optimized for Intel hardware."),
    ("How to fine-tune a model?", "Fine-tuning involves training a model on a specific dataset to adapt it to a particular task."),
    ("What is the best way to optimize inference speed?", "Using quantization and model distillation can significantly improve inference speed.")
]
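# Additional (question, answer) pairs can be appended to faq_data above; the
# encoding and index-building steps below rebuild the search index from this
# list each time the script starts.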

# Encode the FAQ questions as vectors
faq_questions = [q for q, _ in faq_data]
faq_answers = [a for _, a in faq_data]
faq_vectors = np.array(encoder.encode(faq_questions)).astype("float32")

# Build the FAISS index
d = faq_vectors.shape[1]  # embedding dimensionality
index = faiss.IndexFlatL2(d)
index.add(faq_vectors)
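# IndexFlatL2 performs exact (brute-force) L2 search, which is the right fit for
# a handful of FAQ entries. A hedged alternative sketch (not used here): cosine
# similarity via normalized embeddings and an inner-product index:
#   faq_vectors = np.array(encoder.encode(faq_questions, normalize_embeddings=True)).astype("float32")
#   index = faiss.IndexFlatIP(d)  # inner product equals cosine on unit vectors
#   index.add(faq_vectors)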

# Conversation history
history = []

# Query function (search the FAQ first; fall back to the model if there is no match)
def respond(prompt):
    global history
    
    # Encode the input as a vector and query FAISS
    query_vector = np.array(encoder.encode([prompt])).astype("float32")
    D, I = index.search(query_vector, 1)  # find the closest FAQ entry
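    # D holds squared L2 distances and I the matching row indices, each with
    # shape (n_queries, k); here both are (1, 1).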
    
    if D[0][0] < 1.0:  # L2 distance threshold (lower = more similar); 5.0 was too loose and routed every question to the FAQ
        response = faq_answers[I[0][0]]  # answer directly from the FAQ
    else:
        # No FAQ match, so fall back to the language model
        messages = [{"role": "system", "content": "Answer the question in English only."}]
        for user_text, assistant_text in history:
            messages.append({"role": "user", "content": user_text})
            messages.append({"role": "assistant", "content": assistant_text})
        messages.append({"role": "user", "content": prompt})
        
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
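        # apply_chat_template renders the message list as one prompt string in
        # the model's chat format; add_generation_prompt=True appends the
        # assistant-turn marker so generation continues as the assistant.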
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )
        # Strip the prompt tokens first, so only the newly generated text is decoded
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        
    history.append((prompt, response))
    return response
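# Example: respond("What is FAISS?") matches the first FAQ entry at distance ~0,
# well under the 1.0 threshold, and returns its stored answer; an unrelated
# prompt should fall through to the language model instead.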

# Clear the conversation history
def clear_history():
    global history
    history = []
    print("History cleared!")  # no Gradio output component is wired to this handler

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# DeepSeek-R1-Distill-Qwen-1.5B-openvino with history and FAISS")
    
    with gr.Tabs():
        with gr.TabItem("Chat"):
            chat_if = gr.Interface(
                fn=respond,
                inputs=gr.Textbox(label="Prompt", placeholder="Type a message..."),
                outputs=gr.Textbox(label="Response", interactive=False),
                api_name="hchat",
                title="DeepSeek-R1 with FAISS FAQ",
                description="This chatbot first searches an FAQ database using FAISS, then responds using a language model if no match is found."
            )
    
    with gr.Row():
        clear_button = gr.Button("🧹 Clear History")
    
    clear_button.click(fn=clear_history, inputs=[], outputs=[])

if __name__ == "__main__":
    print("Launching Gradio app...")
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)