"""Free CPU chatbot: OpenHermes-2.5-Mistral-7B (GGUF) via llama.cpp + Gradio.

Downloads a quantized model from the Hugging Face Hub, loads it with
llama-cpp-python for CPU inference, and serves a Gradio chat UI.
"""

from llama_cpp import Llama
from huggingface_hub import hf_hub_download

import gradio as gr

# Download the quantized model; hf_hub_download caches locally, so
# repeated runs reuse the existing file instead of re-downloading.
model_path = hf_hub_download(
    repo_id="TheBloke/OpenHermes-2.5-Mistral-7B-GGUF",
    filename="openhermes-2.5-mistral-7b.Q4_K_M.gguf",
)

# Load the model (CPU inference, 2048-token context window).
llm = Llama(model_path=model_path, n_ctx=2048)

# Number of most-recent (user, assistant) turns kept in the prompt so the
# formatted history fits comfortably inside the 2048-token context.
MAX_HISTORY_TURNS = 4


def chatbot_response(message, history):
    """Generate one assistant reply for the Gradio ChatInterface callback.

    Args:
        message: Latest user message (str).
        history: Prior conversation as [user, assistant] pairs
            (Gradio's tuple-style history).

    Returns:
        The model's reply text, or an ``"Error: ..."`` string if
        generation fails.
    """
    # FIX: OpenHermes-2.5 is trained on the ChatML prompt template
    # (<|im_start|>role\n...<|im_end|>), not <|user|>/<|end|> tags.
    # The previous stop token "<|end|>" is never emitted by this model,
    # so generation only stopped at max_tokens and quality suffered
    # from the mismatched template.
    parts = []
    for user_turn, bot_turn in history[-MAX_HISTORY_TURNS:]:
        parts.append(f"<|im_start|>user\n{user_turn}<|im_end|>\n")
        parts.append(f"<|im_start|>assistant\n{bot_turn}<|im_end|>\n")
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n")
    parts.append("<|im_start|>assistant\n")
    prompt = "".join(parts)

    try:
        output = llm(
            prompt,
            max_tokens=256,
            stop=["<|im_end|>"],
            temperature=0.7,
        )
        return output["choices"][0]["text"].strip()
    except Exception as e:
        # Boundary handler: surface the failure in the chat UI rather
        # than crashing the Gradio worker.
        return f"Error: {e}"


demo = gr.ChatInterface(
    fn=chatbot_response,
    title="🧠 Free CPU Chatbot (OpenHermes-2.5)",
    description="A lightweight, high-quality chatbot that runs on CPU using llama.cpp",
    theme="soft",
    examples=["What's your name?", "Tell me a joke", "What is Python used for?"],
)

if __name__ == "__main__":
    demo.launch(share=True)