import os

import gradio as gr
from huggingface_hub import hf_hub_download

# llama-cpp-python is installed at startup so the Space needs no custom build
# step; for local runs, install it ahead of time instead.
os.system("pip install llama-cpp-python")
from llama_cpp import Llama

# --- Configuration and Model Loading ---
MODEL_REPO = "unsloth/gemma-3-270m-it-GGUF"
QUANTIZED_FILENAME = "gemma-3-270m-it-Q4_K_M.gguf"

bot_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/bot.png"
user_im = "https://huggingface.co/spaces/idzkha/Geo-Chat-Bert/resolve/main/user.png"

try:
    print(f"Downloading model: {QUANTIZED_FILENAME} from {MODEL_REPO}...")
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename=QUANTIZED_FILENAME)

    print("Loading GGUF model for CPU inference...")
    llm = Llama(
        model_path=model_path,
        n_ctx=8192,
        n_gpu_layers=0,  # Ensures CPU-only operation
        verbose=False,
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    raise SystemExit(1)

# --- Gradio UI and Logic ---

def generate_chat_stream(user_message, history, system_prompt, max_new_tokens, temperature, top_p):
    """A single generator function to handle streaming chat responses."""
    # Add the user's message to the history, with an empty string as a
    # placeholder for the bot's response.
    history.append([user_message, ""])

    # Build a plain-text prompt. Note that this is a simple "User:/Assistant:"
    # transcript, not the chat template embedded in the GGUF file.
    full_prompt = ""
    if system_prompt and system_prompt.strip():
        full_prompt += f"System: {system_prompt}\n"

    # Pass the history *before* the current turn to the model.
    for user_msg, model_msg in history[:-1]:
        full_prompt += f"User: {user_msg}\n"
        if model_msg:
            full_prompt += f"Assistant: {model_msg}\n"
    full_prompt += f"User: {user_message}\nAssistant: "

    # Generate a streaming response from the model.
    stream = llm(
        prompt=full_prompt,
        max_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        stop=["User:"],  # Keep the model from writing the next user turn itself
        stream=True,
    )

    # Yield the updated history at each step so the UI streams the reply,
    # plus an empty string to clear the user's textbox.
    for output in stream:
        history[-1][1] += output["choices"][0]["text"]
        yield history, ""


# Build the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        f"""# Gradio Chat Demo (CPU Optimized with GGUF)
### Model: `{MODEL_REPO}` | Quantization: `{QUANTIZED_FILENAME}`"""
    )

    chatbot = gr.Chatbot(label="Chat History", height=500, avatar_images=(user_im, bot_im))
    msg = gr.Textbox(
        label="Your Message",
        placeholder="Type your message here and press Enter...",
    )

    with gr.Accordion("Model Parameters", open=False):
        system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful assistant.")
        max_new_tokens = gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max New Tokens")
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)")

    clear = gr.Button("Clear Chat History")

    # Connect the generator function to the submit event.
    msg.submit(
        generate_chat_stream,
        [msg, chatbot, system_prompt, max_new_tokens, temperature, top_p],
        [chatbot, msg],
    )
    clear.click(lambda: [], None, chatbot, queue=False)

# Launch the demo (queue() is required for generator/streaming outputs)
demo.queue().launch(debug=True)
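# --- Note: prompt format (optional alternative, not wired into the UI) ---
# The hand-rolled "User:/Assistant:" transcript built in generate_chat_stream()
# does not match the Gemma chat template shipped inside the GGUF file.
# llama-cpp-python can apply that embedded template via create_chat_completion().
# A minimal commented sketch; the message contents and sampling values here are
# illustrative assumptions, not part of the demo above:
#
#   stream = llm.create_chat_completion(
#       messages=[
#           {"role": "system", "content": "You are a helpful assistant."},
#           {"role": "user", "content": "Hello!"},
#       ],
#       max_tokens=256,
#       temperature=0.7,
#       stream=True,
#   )
#   for chunk in stream:
#       delta = chunk["choices"][0].get("delta", {})
#       print(delta.get("content", ""), end="", flush=True)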