# GLM-4.5-Space / app.py
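# Gradio demo that streams GLM-4.5 chat completions, including optional
# "thinking" (reasoning) traces, from an OpenAI-compatible endpoint.
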
import copy
import time
import html
from openai import OpenAI
import gradio as gr

# Process-wide cancellation flag. It is shared by every session of the demo,
# so one user's Clear interrupts all in-flight generations.
stop_generation = False


def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
    global stop_generation
    # OpenAI() reads OPENAI_API_KEY and OPENAI_BASE_URL from the environment,
    # so this works against any OpenAI-compatible server (such as vLLM).
    client = OpenAI()
    response = client.chat.completions.create(
        model="GLM-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=65536,
        # Non-standard request field, passed through verbatim, that toggles
        # the model's "thinking" (reasoning) mode on the backend.
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            }
        }
    )
    for chunk in response:
        if stop_generation:
            break
        if chunk.choices and chunk.choices[0].delta:
            yield chunk.choices[0].delta
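
# A minimal sketch of consuming the helper directly (hypothetical prompt;
# assumes OPENAI_BASE_URL / OPENAI_API_KEY point at a server hosting a model
# named "GLM-4.5"):
#
#   for delta in stream_from_vllm([{"role": "user", "content": "Hello"}]):
#       print(getattr(delta, "content", None) or "", end="", flush=True)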


class GLM45Model:
    def __init__(self):
        self.accumulated_content = ""
        self.accumulated_reasoning = ""

    def reset_state(self):
        self.accumulated_content = ""
        self.accumulated_reasoning = ""

    def _render_response(self, reasoning_content, regular_content, skip_think=False):
        html_parts = []
        if reasoning_content and not skip_think:
            reasoning_escaped = html.escape(reasoning_content).replace("\n", "<br>")
            think_html = (
                "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>Thinking</summary>"
                "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
                + reasoning_escaped +
                "</div></details>"
            )
            html_parts.append(think_html)
        if regular_content:
            content_escaped = html.escape(regular_content).replace("\n", "<br>")
            content_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_escaped}</div>"
            html_parts.append(content_html)
        return "".join(html_parts)

    def _build_messages(self, raw_hist, sys_prompt):
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": sys_prompt.strip()})
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": h["content"]})
            else:
                msg = {"role": "assistant", "content": h.get("content", "")}
                if h.get("reasoning_content"):
                    msg["reasoning_content"] = h.get("reasoning_content")
                msgs.append(msg)
        return msgs
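
    # Illustrative shape of the list _build_messages returns (values are
    # hypothetical):
    #   [{"role": "system", "content": "Be concise."},
    #    {"role": "user", "content": "Hi"},
    #    {"role": "assistant", "content": "Hello!", "reasoning_content": "..."}]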

    def stream_generate(self, raw_hist, sys_prompt, thinking_enabled=True, temperature=1.0):
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        self.reset_state()
        try:
            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
                if stop_generation:
                    break
                if hasattr(delta, 'content') and delta.content:
                    self.accumulated_content += delta.content
                # reasoning_content is a non-standard delta field the backend
                # emits while thinking mode is active.
                if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
                    self.accumulated_reasoning += delta.reasoning_content
                yield self._render_response(self.accumulated_reasoning, self.accumulated_content, not thinking_enabled)
        except Exception as e:
            yield self._render_response("", f"Error: {str(e)}")


glm45 = GLM45Model()


def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
    global stop_generation
    stop_generation = False
    if raw_hist is None:
        raw_hist = []
    if not msg.strip():
        # chat() is a generator, so updates must be yielded rather than
        # returned; a generator's return value is discarded.
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return
    raw_hist.append({"role": "user", "content": msg.strip()})
    # Placeholder assistant entry, mutated in place as chunks stream in.
    place = {
        "role": "assistant",
        "content": "",
        "reasoning_content": ""
    }
    raw_hist.append(place)
    yield raw_hist, copy.deepcopy(raw_hist), ""
    try:
        # raw_hist[:-1] excludes the placeholder itself from the prompt.
        for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
            if stop_generation:
                break
            place["content"] = glm45.accumulated_content
            place["reasoning_content"] = glm45.accumulated_reasoning
            place["display_content"] = chunk
            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        place["content"] = f"Error: {str(e)}"
        place["display_content"] = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
        yield raw_hist, copy.deepcopy(raw_hist), ""


def reset():
    global stop_generation
    stop_generation = True
    # Give in-flight generator loops a moment to observe the flag before the
    # UI state is cleared.
    time.sleep(0.1)
    return [], [], ""


def format_history_for_display(raw_hist):
    display_hist = []
    for msg in raw_hist:
        if msg["role"] == "user":
            display_hist.append({"role": "user", "content": msg["content"]})
        else:
            content = msg.get("display_content", msg.get("content", ""))
            display_hist.append({"role": "assistant", "content": content})
    return display_hist
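
# Illustrative split (values hypothetical): an assistant entry such as
#   {"role": "assistant", "content": "4", "display_content": "<div>4</div>"}
# shows its rendered display_content in the Chatbot, while the plain content /
# reasoning_content fields are what _build_messages replays to the API.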


demo = gr.Blocks(title="GLM-4.5 API Demo", theme=gr.themes.Soft())
with demo:
    gr.HTML(
        "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.5 API Demo</div>"
        "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>"
        "This demo uses the API version of the service for faster response speeds.<br>"
        "Only chat functionality with a 64K token length is supported. For tool usage, MCP support, and web search, please refer to the API documentation.</div>"
        "<div style='text-align:center;'><a href='https://modelscope.cn/collections/GLM-45-b8693e2a08984f'>Model</a> | "
        "<a href='https://github.com/zai-org/GLM-4.5'>Github</a> | "
        "<a href='http://z.ai/blog/glm-4.5'>Blog</a> | "
        "<a href='https://docs.bigmodel.cn/cn/guide/models/text/glm-4.5'>API Docs</a></div>"
    )
    raw_history = gr.State([])
    with gr.Row():
        with gr.Column(scale=7):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
            gr.HTML(
                "<div style='color:red;font-size:12px;margin-top:5px;margin-bottom:15px;'>"
                "Enabled: Activates the model's thinking capability. The model will decide whether to think based on the situation and may return empty thinking content.<br>"
                "Disabled: Disables the model's thinking capability. The model will answer questions directly without reasoning."
                "</div>"
            )
            temperature_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
                label="Temperature"
            )
            sys = gr.Textbox(label="System Prompt", lines=6)

    def chat_wrapper(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
        for hist, raw_hist_updated, textbox_value in chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
            display_hist = format_history_for_display(hist)
            yield display_hist, raw_hist_updated, textbox_value

    send.click(
        chat_wrapper,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    textbox.submit(
        chat_wrapper,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    clear.click(
        reset,
        outputs=[chatbox, raw_history, textbox]
    )

if __name__ == "__main__":
    demo.queue(max_size=None, default_concurrency_limit=None)
    demo.launch()
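
# A sketch of running this locally (endpoint details are assumptions; any
# OpenAI-compatible server exposing a model named "GLM-4.5" should work):
#
#   export OPENAI_BASE_URL=http://localhost:8000/v1   # e.g. a local vLLM server
#   export OPENAI_API_KEY=EMPTY                       # placeholder key
#   python app.py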