robiro committed
Commit 0b70ac0 · verified · 1 Parent(s): d8247dd

Create app.py

Files changed (1)
  1. app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
+ import gradio as gr
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+ import os
+
+ # --- Configuration ---
+ MODEL_NAME_OR_PATH = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"
+ # Select a specific GGUF file: check the "Files and versions" tab of the Hugging Face
+ # repo for the exact filename you want to use; a Q4_K_M quant is a common choice.
+ MODEL_FILE = "DeepSeek-R1-0528-Qwen3-8B-Q4_K_M.gguf"  # Verify this exact filename exists in the repo.
+
+ # Download the model file if it doesn't exist
+ if not os.path.exists(MODEL_FILE):
+     print(f"Downloading {MODEL_FILE} from {MODEL_NAME_OR_PATH}...")
+     try:
+         hf_hub_download(
+             repo_id=MODEL_NAME_OR_PATH,
+             filename=MODEL_FILE,
+             local_dir=".",  # Download to current directory
+             local_dir_use_symlinks=False  # Good practice for GGUF
+         )
+         print("Download complete.")
+     except Exception as e:
+         print(f"Error downloading model: {e}")
+         print("Please ensure the MODEL_FILE name is correct and available in the repository.")
+         exit()
+ else:
+     print(f"Model file {MODEL_FILE} already exists.")
+
+ # --- Load the GGUF Model ---
+ # Adjust n_gpu_layers if you have a GPU-enabled llama-cpp-python:
+ # -1 means all possible layers to GPU, 0 means CPU only.
+ try:
+     print("Loading model...")
+     llm = Llama(
+         model_path=MODEL_FILE,
+         n_ctx=2048,      # Context window size
+         n_threads=None,  # None for llama.cpp to auto-detect, or set a specific number
+         n_gpu_layers=0   # Change to -1 or a positive number if you have GPU support
+                          # and want to offload layers to GPU.
+     )
+     print("Model loaded successfully.")
+ except Exception as e:
+     print(f"Error loading Llama model: {e}")
+     print("Ensure llama-cpp-python is installed correctly and the model file is valid.")
+     exit()
+
+ # --- Chat Function ---
+ def predict(message, history):
+     history_llama_format = []
+     for human, ai in history:
+         history_llama_format.append({"role": "user", "content": human})
+         history_llama_format.append({"role": "assistant", "content": ai})
+     history_llama_format.append({"role": "user", "content": message})
+
+     # Qwen models often use a specific chat template, so the prompt must be formatted
+     # accordingly. llama-cpp-python's create_chat_completion handles this if the model
+     # has chat template info embedded; otherwise the prompt can be constructed manually,
+     # e.g. for simple generation:
+     # prompt = f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
+
+     # Use create_chat_completion for a more robust approach if the model supports it.
+     try:
+         response = llm.create_chat_completion(
+             messages=history_llama_format,
+             # temperature=0.7,  # Example: adjust for creativity
+             # top_p=0.9,        # Example: nucleus sampling
+             # max_tokens=256    # Max tokens to generate for the response
+         )
+         assistant_response = response['choices'][0]['message']['content']
+     except Exception as e:
+         print(f"Error during model inference: {e}")
+         assistant_response = "Sorry, I encountered an error."
+         # Fall back to simpler generation if create_chat_completion fails or is not
+         # well supported for this GGUF. This is a very basic prompt construction and
+         # may need adjustment for Qwen's specific format.
+         prompt = ""
+         for entry in history_llama_format:
+             if entry["role"] == "user":
+                 prompt += f"<|im_start|>user\n{entry['content']}<|im_end|>\n"
+             elif entry["role"] == "assistant":
+                 prompt += f"<|im_start|>assistant\n{entry['content']}<|im_end|>\n"
+         prompt += "<|im_start|>assistant\n"  # Start of assistant's turn
+
+         try:
+             output = llm(
+                 prompt,
+                 max_tokens=256,
+                 stop=["<|im_end|>", "<|im_start|>user"],  # Stop generation at these tokens
+                 echo=False  # Don't echo the prompt
+             )
+             assistant_response = output['choices'][0]['text'].strip()
+         except Exception as e_fallback:
+             print(f"Error during fallback model inference: {e_fallback}")
+             assistant_response = "Sorry, I encountered an error during fallback."
+
+     return assistant_response
+
+ # --- Gradio Interface ---
+ iface = gr.ChatInterface(
+     fn=predict,
+     title="Unsloth DeepSeek-Qwen3-8B GGUF Chat",
+     description="Chat with the unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF model.",
+     examples=[
+         ["Hello, how are you?"],
+         ["What is the capital of France?"],
+         ["Write a short story about a friendly robot."]
+     ],
+     chatbot=gr.Chatbot(height=600)
+ )
+
+ # --- Launch the App ---
+ if __name__ == "__main__":
+     print("Launching Gradio interface...")
+     iface.launch()