import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the GGUF model from the Hugging Face Hub
print("===== Downloading model... =====")
model_path = hf_hub_download(
    repo_id="bartowski/Dolphin3.0-Llama3.2-3B-GGUF",
    filename="Dolphin3.0-Llama3.2-3B-Q4_K_M.gguf"
)
print(f"Model downloaded to: {model_path}")

# Load the model with llama-cpp-python
print("===== Loading model... =====")
llm = Llama(
    model_path=model_path,
    n_ctx=2048,   # context window size; adjust to available RAM
    n_threads=8   # adjust to the Space's CPU core count (8 is a good default)
)
print("Model loaded.")

# Chat function using a Dolphin 3.0-style prompt template
def chat(user_input):
    print(f"User input: {user_input}")
    full_prompt = f"""### System:
You are Dolphin 3.0, a helpful and friendly AI assistant.

### User:
{user_input}

### Assistant:"""
    output = llm(
        full_prompt,
        max_tokens=512,
        # "</s>" is assumed here as the end-of-sequence stop string (the
        # original token was lost in transit); the turn headers stop the
        # model before it starts writing a new turn itself.
        stop=["</s>", "### User:", "### Assistant:"]
    )
    reply = output['choices'][0]['text'].strip()
    print(f"Model reply: {reply}")
    return reply

# Gradio UI
iface = gr.Interface(
    fn=chat,
    inputs="text",
    outputs="text",
    title="🐬 Dolphin 3.0 - Llama 3.2 3B GGUF Chat",
    description="Running the Dolphin 3.0 Llama 3.2 3B GGUF model with llama-cpp-python on a Hugging Face Space"
)

iface.launch()
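
# For this script to run as a Hugging Face Space, a requirements.txt is
# needed next to app.py. A minimal sketch, assuming no version pins are
# required (these are the PyPI names of the three imports above):
#
#   gradio
#   huggingface_hub
#   llama-cpp-python
#
# Note that llama-cpp-python compiles llama.cpp on install, so the first
# Space build can take several minutes.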