import copy
import time
import html

from openai import OpenAI
import gradio as gr

# Global flag used to cooperatively stop an in-flight streaming generation.
stop_generation = False


def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
    """Stream chat-completion deltas from an OpenAI-compatible GLM-4.5 endpoint."""
    global stop_generation
    client = OpenAI()
    response = client.chat.completions.create(
        model="GLM-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=65536,
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            }
        }
    )
    for chunk in response:
        if stop_generation:
            break
        if chunk.choices and chunk.choices[0].delta:
            yield chunk.choices[0].delta


class GLM45Model:
    def __init__(self):
        self.accumulated_content = ""
        self.accumulated_reasoning = ""

    def reset_state(self):
        self.accumulated_content = ""
        self.accumulated_reasoning = ""

    def _render_response(self, reasoning_content, regular_content, skip_think=False):
        # Render the optional thinking block plus the regular answer as HTML.
        html_parts = []
        if reasoning_content and not skip_think:
            reasoning_escaped = html.escape(reasoning_content).replace("\n", "<br>")
            think_html = (
                "<details open><summary>Thinking</summary>"
                "<div>"
                + reasoning_escaped +
                "</div></details>"
            )
            html_parts.append(think_html)
        if regular_content:
            content_escaped = html.escape(regular_content).replace("\n", "<br>")
            content_html = f"<div>{content_escaped}</div>"
            html_parts.append(content_html)
        return "".join(html_parts)

    def _build_messages(self, raw_hist, sys_prompt):
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": sys_prompt.strip()})
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": h["content"]})
            else:
                msg = {"role": "assistant", "content": h.get("content", "")}
                if h.get("reasoning_content"):
                    msg["reasoning_content"] = h.get("reasoning_content")
                msgs.append(msg)
        return msgs

    def stream_generate(self, raw_hist, sys_prompt, thinking_enabled=True, temperature=1.0):
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        self.reset_state()
        try:
            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
                if stop_generation:
                    break
                if hasattr(delta, "content") and delta.content:
                    self.accumulated_content += delta.content
                if hasattr(delta, "reasoning_content") and delta.reasoning_content:
                    self.accumulated_reasoning += delta.reasoning_content
                yield self._render_response(
                    self.accumulated_reasoning,
                    self.accumulated_content,
                    not thinking_enabled
                )
        except Exception as e:
            yield self._render_response("", f"Error: {str(e)}")


glm45 = GLM45Model()


def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
    global stop_generation
    stop_generation = False
    if raw_hist is None:
        raw_hist = []
    if not msg.strip():
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return
    raw_hist.append({"role": "user", "content": msg.strip()})
    place = {
        "role": "assistant",
        "content": "",
        "reasoning_content": ""
    }
    raw_hist.append(place)
    yield raw_hist, copy.deepcopy(raw_hist), ""
    try:
        for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
            if stop_generation:
                break
            place["content"] = glm45.accumulated_content
            place["reasoning_content"] = glm45.accumulated_reasoning
            place["display_content"] = chunk
            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        place["content"] = f"Error: {str(e)}"
        place["display_content"] = f"<div>Error: {html.escape(str(e))}</div>"
        yield raw_hist, copy.deepcopy(raw_hist), ""


def reset():
    global stop_generation
    stop_generation = True
    time.sleep(0.1)  # Give the streaming loop a moment to observe the stop flag.
    return [], [], ""


def format_history_for_display(raw_hist):
    display_hist = []
    for msg in raw_hist:
        if msg["role"] == "user":
            display_hist.append({"role": "user", "content": msg["content"]})
        else:
            content = msg.get("display_content", msg.get("content", ""))
            display_hist.append({"role": "assistant", "content": content})
    return display_hist


demo = gr.Blocks(title="GLM-4.5 API Demo", theme=gr.themes.Soft())
with demo:
    gr.HTML(
        "<h1>GLM-4.5 API Demo</h1>"
        "<div>"
        "This demo uses the API version of the service for faster responses.<br>"
        "Only chat functionality with 64K token length is supported. "
        "For tool usage, MCP support, and web search, please refer to the API documentation.<br>"
        "Model | Github | Blog | API Docs"
        "</div>"
    )
    raw_history = gr.State([])
    with gr.Row():
        with gr.Column(scale=7):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
            gr.HTML(
                "<div>"
                "Enabled: Activates the model's thinking capability. The model will decide "
                "whether to think based on the situation and may return empty thinking content.<br>"
                "Disabled: Disables the model's thinking capability. The model will answer "
                "questions directly without reasoning."
                "</div>"
            )
            temperature_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
                label="Temperature"
            )
            sys = gr.Textbox(label="System Prompt", lines=6)

    def chat_wrapper(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
        for hist, raw_hist_updated, textbox_value in chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
            display_hist = format_history_for_display(hist)
            yield display_hist, raw_hist_updated, textbox_value

    send.click(
        chat_wrapper,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    textbox.submit(
        chat_wrapper,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    clear.click(
        reset,
        outputs=[chatbox, raw_history, textbox]
    )

if __name__ == "__main__":
    demo.queue(max_size=None, default_concurrency_limit=None)
    demo.launch()