import warnings

import faiss
import gradio as gr
import numpy as np
from optimum.intel import OVModelForCausalLM
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

# Suppress a noisy NumPy "__array__" deprecation warning raised by downstream libraries
warnings.filterwarnings(
    "ignore",
    category=DeprecationWarning,
    message="__array__ implementation doesn't accept a copy keyword",
)

# Load the OpenVINO language model.
# Note: OpenVINO models select their target device via `device=` ("CPU", "GPU", ...);
# transformers' `device_map="auto"` does not apply to OVModelForCausalLM.
model_id = "hsuwill000/DeepSeek-R1-Distill-Qwen-1.5B-openvino"
print("Loading model...")
model = OVModelForCausalLM.from_pretrained(model_id, device="CPU")
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# Load the embedding model (converts text into dense vectors)
encoder = SentenceTransformer("all-MiniLM-L6-v2")

# FAQ knowledge base: (question, answer) pairs
faq_data = [
    ("What is FAISS?",
     "FAISS is a library for efficient similarity search and clustering of dense vectors."),
    ("How does FAISS work?",
     "FAISS uses indexing structures to quickly retrieve the nearest neighbors of a query vector."),
    ("Can FAISS run on GPU?",
     "Yes, FAISS supports GPU acceleration for faster computation."),
    ("What is OpenVINO?",
     "OpenVINO is an inference engine optimized for Intel hardware."),
    ("How to fine-tune a model?",
     "Fine-tuning involves training a model on a specific dataset to adapt it to a particular task."),
    ("What is the best way to optimize inference speed?",
     "Using quantization and model distillation can significantly improve inference speed."),
]

# Encode the FAQ questions into vectors
faq_questions = [q for q, _ in faq_data]
faq_answers = [a for _, a in faq_data]
faq_vectors = np.array(encoder.encode(faq_questions)).astype("float32")

# Build the FAISS index
d = faq_vectors.shape[1]  # vector dimensionality
index = faiss.IndexFlatL2(d)
index.add(faq_vectors)

# Conversation history
history = []


# Answer a prompt: search the FAQ first; fall back to the language model on a miss
def respond(prompt):
    global history
    # Encode the input and look up the single nearest FAQ question
    query_vector = np.array(encoder.encode([prompt])).astype("float32")
    D, I = index.search(query_vector, 1)
    # L2-distance threshold: a lower distance means more similar. A threshold of
    # 5.0 proved too loose and routed almost every question to the FAQ.
    if D[0][0] < 1.0:
        response = faq_answers[I[0][0]]  # answer straight from the FAQ
    else:
        # No FAQ match: rebuild the chat transcript and query the language model
        messages = [{"role": "system", "content": "Answer the question in English only."}]
        for user_text, assistant_text in history:
            messages.append({"role": "user", "content": user_text})
            messages.append({"role": "assistant", "content": assistant_text})
        messages.append({"role": "user", "content": prompt})
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )
        # Drop the prompt tokens so only the newly generated text is decoded
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    history.append((prompt, response))
    return response


# Clear the conversation history
def clear_history():
    global history
    history = []
    print("History cleared!")  # no output component is wired to this event


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# DeepSeek-R1-Distill-Qwen-1.5B-openvino with history + FAISS")
    with gr.Tabs():
        with gr.TabItem("Chat"):
            chat_if = gr.Interface(
                fn=respond,
                inputs=gr.Textbox(label="Prompt", placeholder="Type a message..."),
                outputs=gr.Textbox(label="Response", interactive=False),
                api_name="hchat",
                title="DeepSeek-R1 with FAISS FAQ",
                description=(
                    "This chatbot first searches an FAQ database with FAISS, then "
                    "falls back to the language model when no match is found."
                ),
            )
            with gr.Row():
                clear_button = gr.Button("🧹 Clear History")
                clear_button.click(fn=clear_history, inputs=[], outputs=[])

if __name__ == "__main__":
    print("Launching Gradio app...")
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
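# ---------------------------------------------------------------------------
# Usage sketch (illustrative; not part of the app itself): once the server is
# running, the "hchat" endpoint registered by gr.Interface above can be called
# programmatically via gradio_client. The localhost URL is an assumption;
# point the Client at wherever this app is actually hosted.
#
#     from gradio_client import Client
#
#     client = Client("http://localhost:7860/")
#     print(client.predict("What is FAISS?", api_name="/hchat"))
# ---------------------------------------------------------------------------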