import os

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Get the HF token from the environment
hf_token = os.getenv("HUGGINGFACE_TOKEN")

# Your fine-tuned model
model_id = "alphaoumardev/Llama3-8B-noryu-instruct"

# Authenticate with the token when loading the tokenizer/model
# (use_auth_token is deprecated in recent transformers; token= is the current argument)
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token)
model.eval()

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def chat(user_input, history):
    # history is the Gradio State list; avoid a mutable default argument
    history = history or []
    history.append({"role": "user", "content": user_input})

    # Format the prompt from the conversation so far
    prompt = ""
    for turn in history:
        prompt += f"{turn['role']}: {turn['content']}\n"
    prompt += "assistant:"

    # Tokenize and generate
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keep only the text generated after the final "assistant:" marker
    assistant_reply = output_text.split("assistant:")[-1].strip()
    history.append({"role": "assistant", "content": assistant_reply})

    # Gradio's Chatbot expects a list of (user, assistant) tuples for display
    chat_history = [
        (turn["content"], history[i + 1]["content"])
        for i, turn in enumerate(history[:-1])
        if turn["role"] == "user"
    ]
    return chat_history, history


# Gradio Blocks UI
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    state = gr.State([])  # memory of the conversation
    txt = gr.Textbox(show_label=False, placeholder="Type your message...")
    txt.submit(chat, [txt, state], [chatbot, state])

demo.launch()
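
# Optional alternative, shown only as a sketch and not wired into the app above:
# Llama 3 Instruct models are trained with a specific chat template, so building
# the prompt with the tokenizer's built-in template may yield better replies than
# the plain "role: content" format used in chat(). This assumes the fine-tuned
# model kept the base Llama 3 chat template in its tokenizer config.
def build_prompt_with_template(history):
    # history is a list of {"role": ..., "content": ...} dicts, as in chat()
    return tokenizer.apply_chat_template(
        history,
        tokenize=False,
        add_generation_prompt=True,  # append the assistant header so generation continues as the assistant
    )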