import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the GGUF model from the Hugging Face Hub
print("===== Downloading model... =====")
model_path = hf_hub_download(
    repo_id="bartowski/Dolphin3.0-Llama3.2-3B-GGUF",
    filename="Dolphin3.0-Llama3.2-3B-Q4_K_M.gguf"
)
print(f"Model downloaded to: {model_path}")

# Load the model with llama-cpp-python
print("===== Loading model... =====")
llm = Llama(
    model_path=model_path,
    n_ctx=2048,   # context window size; adjust to available RAM
    n_threads=8   # adjust to the Space's CPU core count (8 is a good default)
)
print("Model loaded.")

# Chat function using a Dolphin 3.0-style prompt template
def chat(user_input):
    print(f"User input: {user_input}")
    full_prompt = f"""### System:
You are Dolphin 3.0, a helpful and friendly AI assistant.

### User:
{user_input}

### Assistant:"""
    output = llm(
        full_prompt,
        max_tokens=512,
        # "</s>" is assumed here as the end-of-sequence stop string (the
        # original token was lost in transit); the turn headers stop the
        # model before it starts writing a new turn itself.
        stop=["</s>", "### User:", "### Assistant:"]
    )
    reply = output['choices'][0]['text'].strip()
    print(f"Model reply: {reply}")
    return reply

# Gradio UI
iface = gr.Interface(
    fn=chat,
    inputs="text",
    outputs="text",
    title="🐬 Dolphin 3.0 - Llama 3.2 3B GGUF Chat",
    description="Running the Dolphin 3.0 Llama 3.2 3B GGUF model with llama-cpp-python on a Hugging Face Space"
)

iface.launch()
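
# For this script to run as a Hugging Face Space, a requirements.txt is
# needed next to app.py. A minimal sketch, assuming no version pins are
# required (these are the PyPI names of the three imports above):
#
#   gradio
#   huggingface_hub
#   llama-cpp-python
#
# Note that llama-cpp-python compiles llama.cpp on install, so the first
# Space build can take several minutes.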