Spaces:

robiro
/

k8o1

Running

App Files Files Community

robiro commited on 26 days ago

Commit

02d4edd

verified ·

1 Parent(s): feddae9

Update app.py

Browse files

Files changed (1) hide show

app.py +357 -96

app.py CHANGED Viewed

@@ -2,118 +2,379 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
 # --- Configuration ---
-MODEL_NAME_OR_PATH = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"
-# Select a specific GGUF file. Check the "Files and versions" tab on Hugging Face
-# For this model, a common choice might be a Q4_K_M quant. Let's pick one.
-# Example: "DeepSeek-R1-0528-Qwen3-8B-Q4_K_M.gguf"
-# You MUST check the Hugging Face repo for the exact filename you want to use.
-# Let's assume this one exists for the example. Replace if needed.
-MODEL_FILE = "DeepSeek-R1-0528-Qwen3-8B-Q4_K_M.gguf" # MAKE SURE THIS FILENAME IS CORRECT on HF
-# Download the model file if it doesn't exist
-if not os.path.exists(MODEL_FILE):
-    print(f"Downloading {MODEL_FILE} from {MODEL_NAME_OR_PATH}...")
-    try:
-        hf_hub_download(
-            repo_id=MODEL_NAME_OR_PATH,
-            filename=MODEL_FILE,
-            local_dir=".", # Download to current directory
-            local_dir_use_symlinks=False # Good practice for GGUF
-        )
-        print("Download complete.")
-    except Exception as e:
-        print(f"Error downloading model: {e}")
-        print("Please ensure the MODEL_FILE name is correct and available in the repository.")
-        exit()
-else:
-    print(f"Model file {MODEL_FILE} already exists.")
-# --- Load the GGUF Model ---
-# Adjust n_gpu_layers if you have a GPU-enabled llama-cpp-python
-# -1 means all possible layers to GPU, 0 means CPU only.
-try:
-    print("Loading model...")
-    llm = Llama(
-        model_path=MODEL_FILE,
-        n_ctx=2048,        # Context window size
-        n_threads=None,    # None for llama.cpp to auto-detect, or set a specific number
-        n_gpu_layers=0     # Change to -1 or a positive number if you have GPU support
-                           # and want to offload layers to GPU.
-    )
-    print("Model loaded successfully.")
-except Exception as e:
-    print(f"Error loading Llama model: {e}")
-    print("Ensure llama-cpp-python is installed correctly and the model file is valid.")
-    exit()
 # --- Chat Function ---
-def predict(message, history):
-    history_llama_format = []
-    for human, ai in history:
-        history_llama_format.append({"role": "user", "content": human})
-        history_llama_format.append({"role": "assistant", "content": ai})
-    history_llama_format.append({"role": "user", "content": message})
-    # Qwen models often use a specific chat template.
-    # We need to format the prompt correctly for the model.
-    # llama-cpp-python's create_chat_completion can handle this if the model
-    # has chat template info embedded, or you might need to construct it manually.
-    # For simpler generation:
-    # prompt = f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
-    # Using create_chat_completion for a more robust approach if model supports it
     try:
         response = llm.create_chat_completion(
-            messages=history_llama_format,
-            # temperature=0.7, # Example: Adjust for creativity
-            # top_p=0.9,       # Example: Nucleus sampling
-            # max_tokens=256   # Max tokens to generate for the response
         )
-        assistant_response = response['choices'][0]['message']['content']
-    except Exception as e:
-        print(f"Error during model inference: {e}")
-        assistant_response = "Sorry, I encountered an error."
-        # Fallback to simpler generation if create_chat_completion fails or is not well-supported for this GGUF
-        # This is a very basic prompt construction, might need adjustment based on Qwen's specific format
         prompt = ""
-        for entry in history_llama_format:
-            if entry["role"] == "user":
-                prompt += f"<|im_start|>user\n{entry['content']}<|im_end|>\n"
-            elif entry["role"] == "assistant":
-                prompt += f"<|im_start|>assistant\n{entry['content']}<|im_end|>\n"
-        prompt += "<|im_start|>assistant\n" # Start of assistant's turn
         try:
             output = llm(
                 prompt,
-                max_tokens=256,
-                stop=["<|im_end|>", "<|im_start|>user"], # Stop generation at these tokens
-                echo=False # Don't echo the prompt
             )
-            assistant_response = output['choices'][0]['text'].strip()
         except Exception as e_fallback:
-            print(f"Error during fallback model inference: {e_fallback}")
-            assistant_response = "Sorry, I encountered an error during fallback."
-    return assistant_response
 # --- Gradio Interface ---
-iface = gr.ChatInterface(
-    fn=predict,
-    title="Unsloth DeepSeek-Qwen3-8B GGUF Chat",
-    description="Chat with the unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF model.",
-    examples=[
-        ["Hello, how are you?"],
-        ["What is the capital of France?"],
-        ["Write a short story about a friendly robot."]
-    ],
-    chatbot=gr.Chatbot(height=600)
-)
-# --- Launch the App ---
 if __name__ == "__main__":
     print("Launching Gradio interface...")
-    iface.launch()

 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
+import time
 # --- Configuration ---
+MODEL_REPO_ID = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"
+# IMPORTANT: Verify this filename exists in the "Files and versions" tab of the repo
+MODEL_FILENAME = "DeepSeek-R1-0528-Qwen3-8B-Q4_K_M.gguf"
+LOCAL_MODEL_PATH = f"./{MODEL_FILENAME}" # Download to current directory
+# LLM Llama Parameters (adjust based on your Space's resources)
+N_CTX = 2048          # Context window size. Default 2048. Max for this model is very large, but needs RAM.
+N_THREADS = None      # Number of CPU threads to use. None = Llama.cpp auto-detects.
+                      # On smaller CPU Spaces (e.g., 2-4 cores), explicitly setting N_THREADS=2 or N_THREADS=4 might be beneficial.
+N_GPU_LAYERS = 0      # Number of layers to offload to GPU. 0 for CPU-only. -1 for all possible.
+VERBOSE_LLAMA = True  # Enable verbose logging from llama.cpp
+# Generation parameters
+DEFAULT_MAX_NEW_TOKENS = 512
+DEFAULT_TEMPERATURE = 0.7
+DEFAULT_TOP_P = 0.95
+DEFAULT_TOP_K = 40
+DEFAULT_REPEAT_PENALTY = 1.1
+# --- Global variable for the model ---
+llm = None
+# --- Model Download ---
+def download_model_if_needed():
+    if not os.path.exists(LOCAL_MODEL_PATH):
+        print(f"Downloading {MODEL_FILENAME} from {MODEL_REPO_ID}...")
+        start_time = time.time()
+        try:
+            hf_hub_download(
+                repo_id=MODEL_REPO_ID,
+                filename=MODEL_FILENAME,
+                local_dir=".",
+                local_dir_use_symlinks=False, # Good practice for GGUF
+                resume_download=True
+            )
+            end_time = time.time()
+            print(f"Download complete in {end_time - start_time:.2f} seconds.")
+            return True
+        except Exception as e:
+            print(f"Error downloading model: {e}")
+            print("Please ensure MODEL_FILENAME is correct and available in the repository.")
+            print(f"Attempted to download: {MODEL_REPO_ID}/{MODEL_FILENAME}")
+            return False
+    else:
+        print(f"Model file {MODEL_FILENAME} already exists.")
+        return True
+    return False
+# --- Model Loading ---
+def load_llm_model():
+    global llm
+    if llm is None: # Load only if not already loaded
+        if not os.path.exists(LOCAL_MODEL_PATH):
+            print("Model file not found. Cannot load.")
+            return False
+        print("Loading Llama model...")
+        start_time = time.time()
+        try:
+            llm = Llama(
+                model_path=LOCAL_MODEL_PATH,
+                n_ctx=N_CTX,
+                n_threads=N_THREADS,
+                n_gpu_layers=N_GPU_LAYERS,
+                verbose=VERBOSE_LLAMA,
+                # logits_all=True, # Set to True if you need logits for all tokens (consumes more VRAM/RAM)
+            )
+            end_time = time.time()
+            print(f"Model loaded successfully in {end_time - start_time:.2f} seconds.")
+            return True
+        except Exception as e:
+            print(f"Error loading Llama model: {e}")
+            print("Ensure llama-cpp-python is installed correctly and the model file is valid.")
+            print(f"If you are on a resource-constrained environment (like free Hugging Face Spaces), "
+                  f"the model ({MODEL_FILENAME}, ~{os.path.getsize(LOCAL_MODEL_PATH)/(1024**3):.2f}GB) might be too large.")
+            print("Try reducing N_CTX or using a smaller model variant if available.")
+            llm = None # Ensure llm is None if loading failed
+            return False
+    else:
+        print("Model already loaded.")
+        return True
 # --- Chat Function ---
+def predict(message, history, system_prompt, max_new_tokens, temperature, top_p, top_k, repeat_penalty):
+    if llm is None:
+        return "Model not loaded. Please check the logs."
+    # Qwen specific chat format elements
+    im_start_token = "<|im_start|>"
+    im_end_token = "<|im_end|>"
+    # Common stop tokens for Qwen-like models
+    stop_tokens = [im_end_token, im_start_token + "user", im_start_token + "system", llm.token_eos()]
+    # Format messages for llama_cpp
+    messages = []
+    if system_prompt and system_prompt.strip():
+        messages.append({"role": "system", "content": system_prompt.strip()})
+    for human_msg, ai_msg in history:
+        messages.append({"role": "user", "content": human_msg})
+        if ai_msg is not None: # ai_msg could be None if it's the first turn and history is just the user prompt
+            messages.append({"role": "assistant", "content": ai_msg})
+    messages.append({"role": "user", "content": message})
+    print("\n--- Input to Model ---")
+    print(f"System Prompt: {system_prompt if system_prompt and system_prompt.strip() else 'None'}")
+    print(f"History: {history}")
+    print(f"Current Message: {message}")
+    print(f"Formatted messages for create_chat_completion: {messages}")
+    print("--- End Input to Model ---\n")
+    assistant_response_text = ""
+    generation_start_time = time.time()
     try:
+        print("Attempting generation with llm.create_chat_completion()...")
         response = llm.create_chat_completion(
+            messages=messages,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            repeat_penalty=repeat_penalty,
+            max_tokens=max_new_tokens,
+            stop=stop_tokens,
+            # stream=True # For streaming output, Gradio handles this differently
         )
+        assistant_response_text = response['choices'][0]['message']['content'].strip()
+        print(f"create_chat_completion successful. Raw response: {response['choices'][0]['message']}")
+    except Exception as e_chat_completion:
+        print(f"Error during create_chat_completion: {e_chat_completion}")
+        print("Falling back to manual prompt construction and llm()...")
+        # Construct prompt manually as a fallback (simplified Qwen format)
         prompt = ""
+        if system_prompt and system_prompt.strip():
+            prompt += f"{im_start_token}system\n{system_prompt.strip()}{im_end_token}\n"
+        for human_msg, ai_msg in history:
+            prompt += f"{im_start_token}user\n{human_msg}{im_end_token}\n"
+            if ai_msg is not None:
+                 prompt += f"{im_start_token}assistant\n{ai_msg}{im_end_token}\n"
+        prompt += f"{im_start_token}user\n{message}{im_end_token}\n{im_start_token}assistant\n" # Model should continue from here
+        print(f"Fallback prompt: {prompt}")
         try:
             output = llm(
                 prompt,
+                max_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repeat_penalty=repeat_penalty,
+                stop=stop_tokens,
+                echo=False # Don't echo the input prompt
             )
+            assistant_response_text = output['choices'][0]['text'].strip()
+            print(f"Fallback llm() successful. Raw output: {output['choices'][0]['text']}")
         except Exception as e_fallback:
+            print(f"Error during fallback llm() generation: {e_fallback}")
+            assistant_response_text = "Sorry, I encountered an error during generation. Please check the logs."
+    generation_end_time = time.time()
+    print(f"Generated response: {assistant_response_text}")
+    print(f"Generation took {generation_end_time - generation_start_time:.2f} seconds.")
+    return assistant_response_text
 # --- Gradio Interface ---
+def create_gradio_interface():
+    with gr.Blocks(theme=gr.themes.Soft()) as iface:
+        gr.Markdown(f"""
+        # Chat with {MODEL_REPO_ID.split('/')[-1]} ({MODEL_FILENAME})
+        This Space runs a GGUF quantized version of the model using `llama-cpp-python`.
+        Model: [{MODEL_REPO_ID}](https://huggingface.co/{MODEL_REPO_ID})
+        GGUF File: `{MODEL_FILENAME}` (Quantization: Q4_K_M)
+        """)
+        with gr.Row():
+            with gr.Column(scale=3):
+                chatbot = gr.Chatbot(
+                    [],
+                    elem_id="chatbot",
+                    label="Chat Window",
+                    bubble_full_width=False,
+                    height=500,
+                )
+                user_input = gr.Textbox(
+                    show_label=False,
+                    placeholder="Type your message here and press Enter...",
+                    container=False,
+                    scale=7,
+                )
+            with gr.Column(scale=1):
+                gr.Markdown("### Model Parameters")
+                system_prompt_input = gr.Textbox(
+                    label="System Prompt (Optional)",
+                    placeholder="e.g., You are a helpful AI assistant.",
+                    lines=3
+                )
+                max_new_tokens_slider = gr.Slider(
+                    minimum=32, maximum=N_CTX, value=DEFAULT_MAX_NEW_TOKENS, step=32, # Max tokens cannot exceed context
+                    label="Max New Tokens"
+                )
+                temperature_slider = gr.Slider(
+                    minimum=0.0, maximum=2.0, value=DEFAULT_TEMPERATURE, step=0.05,
+                    label="Temperature"
+                )
+                top_p_slider = gr.Slider(
+                    minimum=0.0, maximum=1.0, value=DEFAULT_TOP_P, step=0.05,
+                    label="Top-P (Nucleus Sampling)"
+                )
+                top_k_slider = gr.Slider(
+                    minimum=0, maximum=100, value=DEFAULT_TOP_K, step=1,
+                    label="Top-K Sampling"
+                )
+                repeat_penalty_slider = gr.Slider(
+                    minimum=1.0, maximum=2.0, value=DEFAULT_REPEAT_PENALTY, step=0.05,
+                    label="Repeat Penalty"
+                )
+                # Hidden status textbox for errors
+                status_display = gr.Textbox(label="Status", interactive=False, visible=False)
+        # Chat submission logic
+        def handle_submit(message, chat_history, sys_prompt, max_tokens, temp, top_p_val, top_k_val, rep_penalty):
+            if llm is None:
+                # Update status display if model not loaded
+                # This part is tricky as Gradio submit doesn't easily update arbitrary components outside its output
+                # For now, errors from predict will be returned in the chat.
+                # A more robust way would be a global status or specific UI element.
+                print("Attempted to chat but LLM is not loaded.")
+                # A simple way to indicate an issue if llm is None
+                chat_history.append((message, "ERROR: Model not loaded. Please check server logs."))
+                return "", chat_history, "ERROR: Model not loaded."
+            # Append user message
+            chat_history.append((message, None))
+            # We pass the full system prompt and params to predict
+            return "", chat_history, sys_prompt, max_tokens, temp, top_p_val, top_k_val, rep_penalty
+        # Connect user input to the generation
+        submit_args = {
+            "fn": predict,
+            "inputs": [user_input, chatbot, system_prompt_input, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
+            "outputs": [chatbot], # Predict will update the last AI message in chatbot
+        }
+        # Gradio's ChatInterface simplifies history management, but for custom layouts, we manage it manually.
+        # Here, we'll use a more direct approach like gr.Interface or manual updates.
+        # Since we use gr.Chatbot and manage history, we need to ensure `predict` gets the right state.
+        # `predict` directly takes history and returns the new AI response.
+        # Gradio's `gr.Chatbot` will automatically append the (user, ai_response) pair.
+        user_input.submit(
+            predict,
+            [user_input, chatbot, system_prompt_input, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
+            [user_input, chatbot], # Clear user_input, update chatbot
+            # The `predict` function returns only the assistant's response string.
+            # Gradio Chatbot expects the new AI message to be the output to update the last turn.
+            # So, we need a wrapper if we want to clear user_input and update chatbot
+        )
+        # A slightly cleaner way to handle chatbot updates with custom parameters
+        # and clearing input box:
+        def user_chat_fn(user_message, chat_history, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen):
+            if llm is None:
+                chat_history.append((user_message, "ERROR: Model not loaded. Check logs."))
+                return "", chat_history # Clear input, update history
+            # Append user message, AI response will be None initially
+            chat_history.append((user_message, None))
+            return "", chat_history, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen
+        def bot_response_fn(chat_history, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen):
+            if llm is None: # Should be caught by user_chat_fn, but double check
+                return chat_history # No change
+            # The last message in history is the user's current message
+            user_message = chat_history[-1][0]
+            # The history to pass to `predict` should not include the current user turn's empty AI response
+            history_for_predict = chat_history[:-1]
+            bot_msg = predict(user_message, history_for_predict, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen)
+            chat_history[-1] = (user_message, bot_msg) # Update the last turn with AI's response
+            return chat_history
+        # Chain the actions: user input -> update chatbot (user msg) -> bot generates -> update chatbot (bot msg)
+        user_input.submit(
+            user_chat_fn,
+            [user_input, chatbot, system_prompt_input, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
+            [user_input, chatbot, system_prompt_input, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider], # Outputs for user_chat_fn
+            queue=False # User input should be fast
+        ).then(
+            bot_response_fn,
+            [chatbot, system_prompt_input, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
+            [chatbot], # Output for bot_response_fn
+            queue=True # Generation can take time
+        )
+        gr.Examples(
+            examples=[
+                ["Hello, how are you today?", "You are a friendly and helpful AI assistant specializing in concise answers."],
+                ["What is the capital of France?", "Be very brief."],
+                ["Write a short poem about a robot learning to dream.", ""],
+                ["Explain the concept of black holes to a 5-year-old.", "Keep it simple and use an analogy."]
+            ],
+            inputs=[user_input, system_prompt_input],
+            # outputs=[chatbot], # Examples don't directly feed to chatbot output here with this setup
+            # fn=lambda q, s: (None, [(q, predict(q, [], s, ...default_params...))]) # Complex to run predict for examples
+            # For simplicity, examples just populate the input fields.
+        )
+        with gr.Accordion("Advanced/Debug Info", open=False):
+            gr.Markdown(f"""
+            - **Model File:** `{LOCAL_MODEL_PATH}`
+            - **N_CTX:** `{N_CTX}`
+            - **N_THREADS:** `{N_THREADS if N_THREADS is not None else 'Auto'}`
+            - **N_GPU_LAYERS:** `{N_GPU_LAYERS}`
+            - **Log Verbosity (llama.cpp):** `{VERBOSE_LLAMA}`
+            - **Stop Tokens Used:** `{im_start_token}system`, `{im_start_token}user`, `{im_end_token}`, `EOS_TOKEN`
+            """)
+            # Add a button to attempt model reload if it failed initially
+            reload_button = gr.Button("Attempt to Reload Model")
+            reload_status = gr.Label(value="Model Status: Unknown")
+            def update_reload_status():
+                if llm:
+                    return "Model Status: Loaded Successfully"
+                else:
+                    return "Model Status: Not Loaded (Check logs for errors)"
+            def attempt_reload():
+                global llm
+                llm = None # Force re-evaluation of loading
+                if load_llm_model():
+                    return "Model reloaded successfully!"
+                else:
+                    return "Model reload FAILED. Check server logs."
+            reload_button.click(attempt_reload, outputs=[reload_status])
+            iface.load(update_reload_status, outputs=[reload_status]) # Update status on interface load
+    return iface
+# --- Main Execution ---
 if __name__ == "__main__":
+    print("Starting application...")
+    model_available = download_model_if_needed()
+    if model_available:
+        if not load_llm_model():
+            print("Model loading failed. The Gradio interface will start, but chat functionality will be impaired.")
+            print("You can try to reload the model via the 'Advanced/Debug Info' section in the UI.")
+        else:
+            print("Model ready.")
+    else:
+        print("Model download failed. Cannot proceed to load model or start chat functionality.")
+        print("The Gradio interface will start, but it will not be functional.")
+    print("Creating Gradio interface...")
+    app_interface = create_gradio_interface()
     print("Launching Gradio interface...")
+    # Share=True is useful for public links if running locally, but HF Spaces handles public URL.
+    # In_browser=True to open in browser locally.
+    app_interface.launch()
+    print("Gradio interface launched. Check your terminal or logs for the URL.")