import logging
import os
import re

import gradio as gr

from gateway import request_generation
from utils import LATEX_DELIMS

# Connection settings for the OpenAI-compatible endpoint (e.g. a vLLM server).
# All requests go through gateway.request_generation, which receives these
# values directly, so no OpenAI client object is constructed here.
openai_api_key = os.getenv("API_KEY")
openai_api_base = os.getenv("API_ENDPOINT")
MODEL = os.getenv("MODEL_NAME", "")

# Generation and queueing limits, all overridable via environment variables.
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "1024"))
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", "20"))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", str(CONCURRENCY_LIMIT * 4)))

logging.basicConfig(level=logging.INFO)


def format_analysis_response(text):
    """Split a gpt-oss "analysis ... assistantfinal" completion into a
    formatted reasoning section and final answer; otherwise return text as-is."""
    m = re.search(r"analysis(.*?)assistantfinal", text, re.DOTALL)
    if m:
        reasoning = m.group(1).strip()
        response = text.split("assistantfinal", 1)[-1].strip()
        return (
            f"**🤔 Analysis:**\n\n*{reasoning}*\n\n---\n\n"
            f"**💬 Response:**\n\n{response}"
        )
    return text.strip()


def generate(
    message,
    history,
    system_prompt,
    temperature,
    frequency_penalty=0.0,
    presence_penalty=0.0,
    max_new_tokens=MAX_NEW_TOKENS,
):
    """Stream a completion for `message`, replaying `history` as chat context.

    The penalty and token arguments default here because the UI below only
    exposes the system prompt and temperature as additional inputs.
    """
    if not message.strip():
        yield "Please enter a prompt."
        return

    # Normalize history into OpenAI-style message dicts. Gradio may pass
    # either dicts (type="messages") or legacy (user, assistant) pairs.
    msgs = []
    for h in history:
        if isinstance(h, dict):
            msgs.append(h)
        elif isinstance(h, (list, tuple)) and len(h) == 2:
            u, a = h
            if u:
                msgs.append({"role": "user", "content": u})
            if a:
                msgs.append({"role": "assistant", "content": a})

    logging.info("[User] %s", message)
    logging.info("[System] %s | Temp=%s", system_prompt, temperature)

    collected, buffer = "", ""
    yielded_once = False
    try:
        for delta in request_generation(
            api_key=openai_api_key,
            api_base=openai_api_base,
            message=message,
            system_prompt=system_prompt,
            model_name=MODEL,
            chat_history=msgs,
            temperature=temperature,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            max_new_tokens=max_new_tokens,
        ):
            if not delta:
                continue
            collected += delta
            buffer += delta

            # Emit the first token immediately so the UI feels responsive...
            if not yielded_once:
                yield delta
                buffer = ""
                yielded_once = True
                continue

            # ...then batch subsequent updates on newlines or every ~150
            # characters to avoid flooding the frontend with tiny patches.
            if "\n" in buffer or len(buffer) > 150:
                yield collected
                buffer = ""

        final = format_analysis_response(collected)
        # Close an unbalanced inline-math delimiter so the LaTeX renderer
        # doesn't swallow the rest of the message.
        if final.count("$") % 2:
            final += "$"
        yield final
    except Exception as e:
        logging.exception("Stream failed")
        yield f"❌ Error: {e}"


chatbot_ui = gr.ChatInterface(
    fn=generate,
    type="messages",
    chatbot=gr.Chatbot(
        label="OSS vLLM Chatbot",
        type="messages",
        scale=2,
        height=600,
        latex_delimiters=LATEX_DELIMS,
    ),
    stop_btn=True,
    additional_inputs=[
        gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
        gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
    ],
    examples=[
        ["Explain the difference between supervised and unsupervised learning."],
        ["Summarize the plot of Inception in two sentences."],
        ["Show me the LaTeX for the quadratic formula."],
        ["What are the advantages of the AMD Instinct MI300X GPU?"],
        ["Derive the gradient of softmax cross-entropy loss."],
        ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
    ],
    title="GPT-OSS-120B on AMD MI300X",
    description=(
        "This Space is an alpha release demonstrating the gpt-oss-120b model "
        "running on AMD MI300X infrastructure. The Space is released under "
        "the Apache 2.0 License."
    ),
)

if __name__ == "__main__":
    chatbot_ui.queue(
        max_size=QUEUE_SIZE,
        default_concurrency_limit=CONCURRENCY_LIMIT,
    ).launch()
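
# A minimal sketch of running this app locally. The environment variable
# names match the os.getenv() calls above; the endpoint URL, model name, and
# script filename are illustrative placeholders, not values from this repo.
#
#   export API_KEY=EMPTY
#   export API_ENDPOINT=http://localhost:8000/v1
#   export MODEL_NAME=openai/gpt-oss-120b
#   python app.py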