import logging
import os
import re

import gradio as gr

from gateway import request_generation
from utils import LATEX_DELIMS

# Connection settings for the OpenAI-compatible endpoint (e.g. a vLLM server).
# All requests go through gateway.request_generation, which receives these
# values directly, so no OpenAI client object is constructed here.
openai_api_key = os.getenv("API_KEY")
openai_api_base = os.getenv("API_ENDPOINT")
MODEL = os.getenv("MODEL_NAME", "")

# Generation and queueing limits, all overridable via environment variables.
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "1024"))
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", "20"))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", str(CONCURRENCY_LIMIT * 4)))

logging.basicConfig(level=logging.INFO)


def format_analysis_response(text):
    """Split a gpt-oss "analysis ... assistantfinal" completion into a
    formatted reasoning section and final answer; otherwise return text as-is."""
    m = re.search(r"analysis(.*?)assistantfinal", text, re.DOTALL)
    if m:
        reasoning = m.group(1).strip()
        response = text.split("assistantfinal", 1)[-1].strip()
        return (
            f"**🤔 Analysis:**\n\n*{reasoning}*\n\n---\n\n"
            f"**💬 Response:**\n\n{response}"
        )
    return text.strip()


def generate(
    message,
    history,
    system_prompt,
    temperature,
    frequency_penalty=0.0,
    presence_penalty=0.0,
    max_new_tokens=MAX_NEW_TOKENS,
):
    """Stream a completion for `message`, replaying `history` as chat context.

    The penalty and token arguments default here because the UI below only
    exposes the system prompt and temperature as additional inputs.
    """
    if not message.strip():
        yield "Please enter a prompt."
        return

    # Normalize history into OpenAI-style message dicts. Gradio may pass
    # either dicts (type="messages") or legacy (user, assistant) pairs.
    msgs = []
    for h in history:
        if isinstance(h, dict):
            msgs.append(h)
        elif isinstance(h, (list, tuple)) and len(h) == 2:
            u, a = h
            if u:
                msgs.append({"role": "user", "content": u})
            if a:
                msgs.append({"role": "assistant", "content": a})

    logging.info("[User] %s", message)
    logging.info("[System] %s | Temp=%s", system_prompt, temperature)

    collected, buffer = "", ""
    yielded_once = False
    try:
        for delta in request_generation(
            api_key=openai_api_key,
            api_base=openai_api_base,
            message=message,
            system_prompt=system_prompt,
            model_name=MODEL,
            chat_history=msgs,
            temperature=temperature,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            max_new_tokens=max_new_tokens,
        ):
            if not delta:
                continue
            collected += delta
            buffer += delta

            # Emit the first token immediately so the UI feels responsive...
            if not yielded_once:
                yield delta
                buffer = ""
                yielded_once = True
                continue

            # ...then batch subsequent updates on newlines or every ~150
            # characters to avoid flooding the frontend with tiny patches.
            if "\n" in buffer or len(buffer) > 150:
                yield collected
                buffer = ""

        final = format_analysis_response(collected)
        # Close an unbalanced inline-math delimiter so the LaTeX renderer
        # doesn't swallow the rest of the message.
        if final.count("$") % 2:
            final += "$"
        yield final
    except Exception as e:
        logging.exception("Stream failed")
        yield f"❌ Error: {e}"


chatbot_ui = gr.ChatInterface(
    fn=generate,
    type="messages",
    chatbot=gr.Chatbot(
        label="OSS vLLM Chatbot",
        type="messages",
        scale=2,
        height=600,
        latex_delimiters=LATEX_DELIMS,
    ),
    stop_btn=True,
    additional_inputs=[
        gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
        gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
    ],
    examples=[
        ["Explain the difference between supervised and unsupervised learning."],
        ["Summarize the plot of Inception in two sentences."],
        ["Show me the LaTeX for the quadratic formula."],
        ["What are the advantages of the AMD Instinct MI300X GPU?"],
        ["Derive the gradient of softmax cross-entropy loss."],
        ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
    ],
    title="GPT-OSS-120B on AMD MI300X",
    description=(
        "This Space is an alpha release demonstrating the gpt-oss-120b model "
        "running on AMD MI300X infrastructure. The Space is released under "
        "the Apache 2.0 License."
    ),
)

if __name__ == "__main__":
    chatbot_ui.queue(
        max_size=QUEUE_SIZE,
        default_concurrency_limit=CONCURRENCY_LIMIT,
    ).launch()
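
# A minimal sketch of running this app locally. The environment variable
# names match the os.getenv() calls above; the endpoint URL, model name, and
# script filename are illustrative placeholders, not values from this repo.
#
#   export API_KEY=EMPTY
#   export API_ENDPOINT=http://localhost:8000/v1
#   export MODEL_NAME=openai/gpt-oss-120b
#   python app.py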