robiro committed (verified)
Commit 02d4edd · 1 Parent(s): feddae9

Update app.py

Files changed (1):
  1. app.py +357 -96

app.py CHANGED
@@ -2,118 +2,379 @@ import gradio as gr
  from llama_cpp import Llama
  from huggingface_hub import hf_hub_download
  import os
 
  # --- Configuration ---
- MODEL_NAME_OR_PATH = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"
- # Select a specific GGUF file. Check the "Files and versions" tab on Hugging Face
- # For this model, a common choice might be a Q4_K_M quant. Let's pick one.
- # Example: "DeepSeek-R1-0528-Qwen3-8B-Q4_K_M.gguf"
- # You MUST check the Hugging Face repo for the exact filename you want to use.
- # Let's assume this one exists for the example. Replace if needed.
- MODEL_FILE = "DeepSeek-R1-0528-Qwen3-8B-Q4_K_M.gguf"  # MAKE SURE THIS FILENAME IS CORRECT on HF
-
- # Download the model file if it doesn't exist
- if not os.path.exists(MODEL_FILE):
-     print(f"Downloading {MODEL_FILE} from {MODEL_NAME_OR_PATH}...")
-     try:
-         hf_hub_download(
-             repo_id=MODEL_NAME_OR_PATH,
-             filename=MODEL_FILE,
-             local_dir=".",  # Download to current directory
-             local_dir_use_symlinks=False  # Good practice for GGUF
-         )
-         print("Download complete.")
-     except Exception as e:
-         print(f"Error downloading model: {e}")
-         print("Please ensure the MODEL_FILE name is correct and available in the repository.")
-         exit()
- else:
-     print(f"Model file {MODEL_FILE} already exists.")
-
- # --- Load the GGUF Model ---
- # Adjust n_gpu_layers if you have a GPU-enabled llama-cpp-python
- # -1 means all possible layers to GPU, 0 means CPU only.
- try:
-     print("Loading model...")
-     llm = Llama(
-         model_path=MODEL_FILE,
-         n_ctx=2048,  # Context window size
-         n_threads=None,  # None for llama.cpp to auto-detect, or set a specific number
-         n_gpu_layers=0  # Change to -1 or a positive number if you have GPU support
-                         # and want to offload layers to GPU.
-     )
-     print("Model loaded successfully.")
- except Exception as e:
-     print(f"Error loading Llama model: {e}")
-     print("Ensure llama-cpp-python is installed correctly and the model file is valid.")
-     exit()
 
  # --- Chat Function ---
- def predict(message, history):
-     history_llama_format = []
-     for human, ai in history:
-         history_llama_format.append({"role": "user", "content": human})
-         history_llama_format.append({"role": "assistant", "content": ai})
-     history_llama_format.append({"role": "user", "content": message})
-
-     # Qwen models often use a specific chat template.
-     # We need to format the prompt correctly for the model.
-     # llama-cpp-python's create_chat_completion can handle this if the model
-     # has chat template info embedded, or you might need to construct it manually.
-     # For simpler generation:
-     # prompt = f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
-
-     # Using create_chat_completion for a more robust approach if model supports it
      try:
          response = llm.create_chat_completion(
-             messages=history_llama_format,
-             # temperature=0.7,  # Example: Adjust for creativity
-             # top_p=0.9,  # Example: Nucleus sampling
-             # max_tokens=256  # Max tokens to generate for the response
          )
-         assistant_response = response['choices'][0]['message']['content']
-     except Exception as e:
-         print(f"Error during model inference: {e}")
-         assistant_response = "Sorry, I encountered an error."
-         # Fallback to simpler generation if create_chat_completion fails or is not well-supported for this GGUF
-         # This is a very basic prompt construction, might need adjustment based on Qwen's specific format
          prompt = ""
-         for entry in history_llama_format:
-             if entry["role"] == "user":
-                 prompt += f"<|im_start|>user\n{entry['content']}<|im_end|>\n"
-             elif entry["role"] == "assistant":
-                 prompt += f"<|im_start|>assistant\n{entry['content']}<|im_end|>\n"
-         prompt += "<|im_start|>assistant\n"  # Start of assistant's turn
 
          try:
              output = llm(
                  prompt,
-                 max_tokens=256,
-                 stop=["<|im_end|>", "<|im_start|>user"],  # Stop generation at these tokens
-                 echo=False  # Don't echo the prompt
              )
-             assistant_response = output['choices'][0]['text'].strip()
          except Exception as e_fallback:
-             print(f"Error during fallback model inference: {e_fallback}")
-             assistant_response = "Sorry, I encountered an error during fallback."
-
 
-     return assistant_response
 
  # --- Gradio Interface ---
- iface = gr.ChatInterface(
-     fn=predict,
-     title="Unsloth DeepSeek-Qwen3-8B GGUF Chat",
-     description="Chat with the unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF model.",
-     examples=[
-         ["Hello, how are you?"],
-         ["What is the capital of France?"],
-         ["Write a short story about a friendly robot."]
-     ],
-     chatbot=gr.Chatbot(height=600)
- )
-
- # --- Launch the App ---
  if __name__ == "__main__":
      print("Launching Gradio interface...")
-     iface.launch()

  from llama_cpp import Llama
  from huggingface_hub import hf_hub_download
  import os
+ import time
 
  # --- Configuration ---
+ MODEL_REPO_ID = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"
+ # IMPORTANT: Verify this filename exists in the "Files and versions" tab of the repo
+ MODEL_FILENAME = "DeepSeek-R1-0528-Qwen3-8B-Q4_K_M.gguf"
+ LOCAL_MODEL_PATH = f"./{MODEL_FILENAME}"  # Download to current directory
+
+ # LLM Llama Parameters (adjust based on your Space's resources)
+ N_CTX = 2048  # Context window size. Default 2048. Max for this model is very large, but needs RAM.
+ N_THREADS = None  # Number of CPU threads to use. None = Llama.cpp auto-detects.
+ # On smaller CPU Spaces (e.g., 2-4 cores), explicitly setting N_THREADS=2 or N_THREADS=4 might be beneficial.
+ N_GPU_LAYERS = 0  # Number of layers to offload to GPU. 0 for CPU-only. -1 for all possible.
+ VERBOSE_LLAMA = True  # Enable verbose logging from llama.cpp
+
+ # Generation parameters
+ DEFAULT_MAX_NEW_TOKENS = 512
+ DEFAULT_TEMPERATURE = 0.7
+ DEFAULT_TOP_P = 0.95
+ DEFAULT_TOP_K = 40
+ DEFAULT_REPEAT_PENALTY = 1.1
+
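+ # Illustrative sketch (an assumption, not part of the app's startup flow): the GGUF filename
+ # above can be double-checked against the repo before deploying, for example:
+ #     from huggingface_hub import list_repo_files
+ #     print([f for f in list_repo_files(MODEL_REPO_ID) if f.endswith(".gguf")])
+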
+ # --- Global variable for the model ---
+ llm = None
+
+ # --- Model Download ---
+ def download_model_if_needed():
+     if not os.path.exists(LOCAL_MODEL_PATH):
+         print(f"Downloading {MODEL_FILENAME} from {MODEL_REPO_ID}...")
+         start_time = time.time()
+         try:
+             hf_hub_download(
+                 repo_id=MODEL_REPO_ID,
+                 filename=MODEL_FILENAME,
+                 local_dir=".",
+                 local_dir_use_symlinks=False,  # Good practice for GGUF
+                 resume_download=True
+             )
+             end_time = time.time()
+             print(f"Download complete in {end_time - start_time:.2f} seconds.")
+             return True
+         except Exception as e:
+             print(f"Error downloading model: {e}")
+             print("Please ensure MODEL_FILENAME is correct and available in the repository.")
+             print(f"Attempted to download: {MODEL_REPO_ID}/{MODEL_FILENAME}")
+             return False
+     else:
+         print(f"Model file {MODEL_FILENAME} already exists.")
+         return True
+     return False
+
+ # --- Model Loading ---
+ def load_llm_model():
+     global llm
+     if llm is None:  # Load only if not already loaded
+         if not os.path.exists(LOCAL_MODEL_PATH):
+             print("Model file not found. Cannot load.")
+             return False
+         print("Loading Llama model...")
+         start_time = time.time()
+         try:
+             llm = Llama(
+                 model_path=LOCAL_MODEL_PATH,
+                 n_ctx=N_CTX,
+                 n_threads=N_THREADS,
+                 n_gpu_layers=N_GPU_LAYERS,
+                 verbose=VERBOSE_LLAMA,
+                 # logits_all=True,  # Set to True if you need logits for all tokens (consumes more VRAM/RAM)
+             )
+             end_time = time.time()
+             print(f"Model loaded successfully in {end_time - start_time:.2f} seconds.")
+             return True
+         except Exception as e:
+             print(f"Error loading Llama model: {e}")
+             print("Ensure llama-cpp-python is installed correctly and the model file is valid.")
+             print(f"If you are on a resource-constrained environment (like free Hugging Face Spaces), "
+                   f"the model ({MODEL_FILENAME}, ~{os.path.getsize(LOCAL_MODEL_PATH)/(1024**3):.2f}GB) might be too large.")
+             print("Try reducing N_CTX or using a smaller model variant if available.")
+             llm = None  # Ensure llm is None if loading failed
+             return False
+     else:
+         print("Model already loaded.")
+         return True
+
 
  # --- Chat Function ---
+ def predict(message, history, system_prompt, max_new_tokens, temperature, top_p, top_k, repeat_penalty):
+     if llm is None:
+         return "Model not loaded. Please check the logs."
+
+     # Qwen-specific chat format elements
+     im_start_token = "<|im_start|>"
+     im_end_token = "<|im_end|>"
+     # Common stop sequences for Qwen-like models. Only strings are passed here, since
+     # llama-cpp-python expects `stop` to be a list of strings; the EOS token itself is
+     # already handled by llama.cpp.
+     stop_tokens = [im_end_token, im_start_token + "user", im_start_token + "system"]
+
+     # Format messages for llama_cpp
+     messages = []
+     if system_prompt and system_prompt.strip():
+         messages.append({"role": "system", "content": system_prompt.strip()})
+
+     for human_msg, ai_msg in history:
+         messages.append({"role": "user", "content": human_msg})
+         if ai_msg is not None:  # ai_msg can be None if the turn so far only contains the user prompt
+             messages.append({"role": "assistant", "content": ai_msg})
+     messages.append({"role": "user", "content": message})
+
+     print("\n--- Input to Model ---")
+     print(f"System Prompt: {system_prompt if system_prompt and system_prompt.strip() else 'None'}")
+     print(f"History: {history}")
+     print(f"Current Message: {message}")
+     print(f"Formatted messages for create_chat_completion: {messages}")
+     print("--- End Input to Model ---\n")
+
+     assistant_response_text = ""
+     generation_start_time = time.time()
+
      try:
+         print("Attempting generation with llm.create_chat_completion()...")
          response = llm.create_chat_completion(
+             messages=messages,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             repeat_penalty=repeat_penalty,
+             max_tokens=max_new_tokens,
+             stop=stop_tokens,
+             # stream=True  # For streaming output; Gradio handles this differently
          )
+         assistant_response_text = response['choices'][0]['message']['content'].strip()
+         print(f"create_chat_completion successful. Raw response: {response['choices'][0]['message']}")
+
+     except Exception as e_chat_completion:
+         print(f"Error during create_chat_completion: {e_chat_completion}")
+         print("Falling back to manual prompt construction and llm()...")
+
+         # Construct the prompt manually as a fallback (simplified Qwen format)
          prompt = ""
+         if system_prompt and system_prompt.strip():
+             prompt += f"{im_start_token}system\n{system_prompt.strip()}{im_end_token}\n"
+
+         for human_msg, ai_msg in history:
+             prompt += f"{im_start_token}user\n{human_msg}{im_end_token}\n"
+             if ai_msg is not None:
+                 prompt += f"{im_start_token}assistant\n{ai_msg}{im_end_token}\n"
+         prompt += f"{im_start_token}user\n{message}{im_end_token}\n{im_start_token}assistant\n"  # Model should continue from here
+
+         print(f"Fallback prompt: {prompt}")
 
          try:
              output = llm(
                  prompt,
+                 max_tokens=max_new_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 top_k=top_k,
+                 repeat_penalty=repeat_penalty,
+                 stop=stop_tokens,
+                 echo=False  # Don't echo the input prompt
              )
+             assistant_response_text = output['choices'][0]['text'].strip()
+             print(f"Fallback llm() successful. Raw output: {output['choices'][0]['text']}")
          except Exception as e_fallback:
+             print(f"Error during fallback llm() generation: {e_fallback}")
+             assistant_response_text = "Sorry, I encountered an error during generation. Please check the logs."
 
+     generation_end_time = time.time()
+     print(f"Generated response: {assistant_response_text}")
+     print(f"Generation took {generation_end_time - generation_start_time:.2f} seconds.")
+     return assistant_response_text
 
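+ # Illustration (an assumed single-turn example, derived from the fallback construction above):
+ # with no system prompt and no prior history, the manually built prompt is
+ #     <|im_start|>user
+ #     Hello, how are you today?<|im_end|>
+ #     <|im_start|>assistant
+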
  # --- Gradio Interface ---
+ def create_gradio_interface():
+     with gr.Blocks(theme=gr.themes.Soft()) as iface:
+         gr.Markdown(f"""
+         # Chat with {MODEL_REPO_ID.split('/')[-1]} ({MODEL_FILENAME})
+         This Space runs a GGUF quantized version of the model using `llama-cpp-python`.
+         Model: [{MODEL_REPO_ID}](https://huggingface.co/{MODEL_REPO_ID})
+         GGUF File: `{MODEL_FILENAME}` (Quantization: Q4_K_M)
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=3):
+                 chatbot = gr.Chatbot(
+                     [],
+                     elem_id="chatbot",
+                     label="Chat Window",
+                     bubble_full_width=False,
+                     height=500,
+                 )
+                 user_input = gr.Textbox(
+                     show_label=False,
+                     placeholder="Type your message here and press Enter...",
+                     container=False,
+                     scale=7,
+                 )
+
+             with gr.Column(scale=1):
+                 gr.Markdown("### Model Parameters")
+                 system_prompt_input = gr.Textbox(
+                     label="System Prompt (Optional)",
+                     placeholder="e.g., You are a helpful AI assistant.",
+                     lines=3
+                 )
+                 max_new_tokens_slider = gr.Slider(
+                     minimum=32, maximum=N_CTX, value=DEFAULT_MAX_NEW_TOKENS, step=32,  # Max tokens cannot exceed context
+                     label="Max New Tokens"
+                 )
+                 temperature_slider = gr.Slider(
+                     minimum=0.0, maximum=2.0, value=DEFAULT_TEMPERATURE, step=0.05,
+                     label="Temperature"
+                 )
+                 top_p_slider = gr.Slider(
+                     minimum=0.0, maximum=1.0, value=DEFAULT_TOP_P, step=0.05,
+                     label="Top-P (Nucleus Sampling)"
+                 )
+                 top_k_slider = gr.Slider(
+                     minimum=0, maximum=100, value=DEFAULT_TOP_K, step=1,
+                     label="Top-K Sampling"
+                 )
+                 repeat_penalty_slider = gr.Slider(
+                     minimum=1.0, maximum=2.0, value=DEFAULT_REPEAT_PENALTY, step=0.05,
+                     label="Repeat Penalty"
+                 )
+                 # Hidden status textbox for errors
+                 status_display = gr.Textbox(label="Status", interactive=False, visible=False)
+
+         # Chat submission logic
+         def handle_submit(message, chat_history, sys_prompt, max_tokens, temp, top_p_val, top_k_val, rep_penalty):
+             if llm is None:
+                 # Update status display if model not loaded
+                 # This part is tricky as Gradio submit doesn't easily update arbitrary components outside its output
+                 # For now, errors from predict will be returned in the chat.
+                 # A more robust way would be a global status or specific UI element.
+                 print("Attempted to chat but LLM is not loaded.")
+                 # A simple way to indicate an issue if llm is None
+                 chat_history.append((message, "ERROR: Model not loaded. Please check server logs."))
+                 return "", chat_history, "ERROR: Model not loaded."
+
+             # Append user message
+             chat_history.append((message, None))
+             # We pass the full system prompt and params to predict
+             return "", chat_history, sys_prompt, max_tokens, temp, top_p_val, top_k_val, rep_penalty
+
+         # Connect user input to the generation
+         submit_args = {
+             "fn": predict,
+             "inputs": [user_input, chatbot, system_prompt_input, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
+             "outputs": [chatbot],  # Predict will update the last AI message in chatbot
+         }
+
+         # Gradio's ChatInterface simplifies history management, but for custom layouts, we manage it manually.
+         # Since we use gr.Chatbot and manage history, we need to ensure `predict` gets the right state.
+         # `predict` directly takes history and returns the new AI response.
+
+         # Note: `predict` is not wired to user_input.submit directly. It returns only the
+         # assistant's response string, while Gradio needs outputs that clear user_input and
+         # update the chatbot, so wrapper functions are used instead.
+
+         # A slightly cleaner way to handle chatbot updates with custom parameters
+         # and clearing the input box:
+         def user_chat_fn(user_message, chat_history, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen):
+             if llm is None:
+                 chat_history.append((user_message, "ERROR: Model not loaded. Check logs."))
+                 # Clear input, update history; echo the parameters back unchanged so the
+                 # number of return values matches the registered outputs.
+                 return "", chat_history, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen
+
+             # Append user message, AI response will be None initially
+             chat_history.append((user_message, None))
+             return "", chat_history, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen
+
+         def bot_response_fn(chat_history, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen):
+             if llm is None:  # Should be caught by user_chat_fn, but double check
+                 return chat_history  # No change
+
+             # The last message in history is the user's current message
+             user_message = chat_history[-1][0]
+             # The history to pass to `predict` should not include the current user turn's empty AI response
+             history_for_predict = chat_history[:-1]
+
+             bot_msg = predict(user_message, history_for_predict, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen)
+             chat_history[-1] = (user_message, bot_msg)  # Update the last turn with AI's response
+             return chat_history
+
+         # Chain the actions: user input -> update chatbot (user msg) -> bot generates -> update chatbot (bot msg)
+         user_input.submit(
+             user_chat_fn,
+             [user_input, chatbot, system_prompt_input, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
+             [user_input, chatbot, system_prompt_input, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],  # Outputs for user_chat_fn
+             queue=False  # User input should be fast
+         ).then(
+             bot_response_fn,
+             [chatbot, system_prompt_input, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
+             [chatbot],  # Output for bot_response_fn
+             queue=True  # Generation can take time
+         )
+
+         gr.Examples(
+             examples=[
+                 ["Hello, how are you today?", "You are a friendly and helpful AI assistant specializing in concise answers."],
+                 ["What is the capital of France?", "Be very brief."],
+                 ["Write a short poem about a robot learning to dream.", ""],
+                 ["Explain the concept of black holes to a 5-year-old.", "Keep it simple and use an analogy."]
+             ],
+             inputs=[user_input, system_prompt_input],
+             # outputs=[chatbot],  # Examples don't directly feed to chatbot output here with this setup
+             # fn=lambda q, s: (None, [(q, predict(q, [], s, ...default_params...))])  # Complex to run predict for examples
+             # For simplicity, examples just populate the input fields.
+         )
+
+         with gr.Accordion("Advanced/Debug Info", open=False):
+             gr.Markdown(f"""
+             - **Model File:** `{LOCAL_MODEL_PATH}`
+             - **N_CTX:** `{N_CTX}`
+             - **N_THREADS:** `{N_THREADS if N_THREADS is not None else 'Auto'}`
+             - **N_GPU_LAYERS:** `{N_GPU_LAYERS}`
+             - **Log Verbosity (llama.cpp):** `{VERBOSE_LLAMA}`
+             - **Stop Tokens Used:** `<|im_end|>`, `<|im_start|>user`, `<|im_start|>system`
+             """)
+             # Add a button to attempt a model reload if it failed initially
+             reload_button = gr.Button("Attempt to Reload Model")
+             reload_status = gr.Label(value="Model Status: Unknown")
+
+             def update_reload_status():
+                 if llm:
+                     return "Model Status: Loaded Successfully"
+                 else:
+                     return "Model Status: Not Loaded (Check logs for errors)"
+
+             def attempt_reload():
+                 global llm
+                 llm = None  # Force re-evaluation of loading
+                 if load_llm_model():
+                     return "Model reloaded successfully!"
+                 else:
+                     return "Model reload FAILED. Check server logs."
+
+             reload_button.click(attempt_reload, outputs=[reload_status])
+             iface.load(update_reload_status, outputs=[reload_status])  # Update status on interface load
+
+     return iface
+
+ # --- Main Execution ---
  if __name__ == "__main__":
+     print("Starting application...")
+     model_available = download_model_if_needed()
+
+     if model_available:
+         if not load_llm_model():
+             print("Model loading failed. The Gradio interface will start, but chat functionality will be impaired.")
+             print("You can try to reload the model via the 'Advanced/Debug Info' section in the UI.")
+         else:
+             print("Model ready.")
+     else:
+         print("Model download failed. Cannot proceed to load the model or start chat functionality.")
+         print("The Gradio interface will start, but it will not be functional.")
+
+     print("Creating Gradio interface...")
+     app_interface = create_gradio_interface()
+
      print("Launching Gradio interface...")
+     # share=True is useful for public links when running locally; HF Spaces provides the public URL itself.
+     # inbrowser=True opens the app in a browser tab when running locally.
+     app_interface.launch()
+     print("Gradio interface launched. Check your terminal or logs for the URL.")
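For a quick sanity check of the updated file outside the Gradio UI, a minimal smoke-test sketch (not part of this commit; it assumes the file is importable as app and reuses the defaults and example prompts defined above):

    import app

    if app.download_model_if_needed() and app.load_llm_model():
        reply = app.predict(
            "What is the capital of France?",  # message
            [],                                # empty chat history
            "Be very brief.",                  # system prompt
            app.DEFAULT_MAX_NEW_TOKENS,
            app.DEFAULT_TEMPERATURE,
            app.DEFAULT_TOP_P,
            app.DEFAULT_TOP_K,
            app.DEFAULT_REPEAT_PENALTY,
        )
        print(reply)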