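# Gradio chat demo for GLM-4.5 served over an OpenAI-compatible API.
# The app streams the assistant's answer and, when enabled, its "thinking" trace.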
import copy
import time
import html

from openai import OpenAI
import gradio as gr

stop_generation = False
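

# NOTE: OpenAI() picks up OPENAI_API_KEY and OPENAI_BASE_URL from the environment.
# For a self-hosted vLLM deployment these would typically point at its
# OpenAI-compatible endpoint (an assumption; the deployment config is not shown here).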
def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
    global stop_generation
    client = OpenAI()
    response = client.chat.completions.create(
        model="GLM-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=65536,
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            }
        }
    )
    for chunk in response:
        if stop_generation:
            break
        if chunk.choices and chunk.choices[0].delta:
            yield chunk.choices[0].delta
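

# A streamed delta may carry `content` (answer text) and/or `reasoning_content`
# (the model's thinking); GLM45Model below accumulates the two streams separately.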
class GLM45Model:
    def __init__(self):
        self.accumulated_content = ""
        self.accumulated_reasoning = ""

    def reset_state(self):
        self.accumulated_content = ""
        self.accumulated_reasoning = ""

    def _render_response(self, reasoning_content, regular_content, skip_think=False):
        # Build the HTML shown in the chatbot: an optional collapsible "Thinking"
        # section followed by the escaped answer text.
        html_parts = []
        if reasoning_content and not skip_think:
            reasoning_escaped = html.escape(reasoning_content).replace("\n", "<br>")
            think_html = (
                "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>Thinking</summary>"
                "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
                + reasoning_escaped +
                "</div></details>"
            )
            html_parts.append(think_html)
        if regular_content:
            content_escaped = html.escape(regular_content).replace("\n", "<br>")
            content_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_escaped}</div>"
            html_parts.append(content_html)
        return "".join(html_parts)

    def _build_messages(self, raw_hist, sys_prompt):
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": sys_prompt.strip()})
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": h["content"]})
            else:
                msg = {"role": "assistant", "content": h.get("content", "")}
                if h.get("reasoning_content"):
                    msg["reasoning_content"] = h.get("reasoning_content")
                msgs.append(msg)
        return msgs
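
    # Illustrative shape of the list built above (example values only):
    #   [{"role": "system", "content": "You are helpful."},
    #    {"role": "user", "content": "Hi"},
    #    {"role": "assistant", "content": "Hello!", "reasoning_content": "..."}]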

    def stream_generate(self, raw_hist, sys_prompt, thinking_enabled=True, temperature=1.0):
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        self.reset_state()
        try:
            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
                if stop_generation:
                    break
                if hasattr(delta, 'content') and delta.content:
                    self.accumulated_content += delta.content
                if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
                    self.accumulated_reasoning += delta.reasoning_content
                # Hide the thinking block in the UI when thinking is disabled.
                yield self._render_response(self.accumulated_reasoning, self.accumulated_content, not thinking_enabled)
        except Exception as e:
            yield self._render_response("", f"Error: {str(e)}")


glm45 = GLM45Model()


def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
    global stop_generation
    stop_generation = False
    if raw_hist is None:
        raw_hist = []
    if not msg.strip():
        # chat() is a generator, so results must be yielded; a bare
        # `return value` inside a generator would never reach Gradio.
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return
    raw_hist.append({"role": "user", "content": msg.strip()})
    place = {
        "role": "assistant",
        "content": "",
        "reasoning_content": ""
    }
    raw_hist.append(place)
    yield raw_hist, copy.deepcopy(raw_hist), ""
    try:
        # Stream into the placeholder message; exclude it from the prompt history.
        for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
            if stop_generation:
                break
            place["content"] = glm45.accumulated_content
            place["reasoning_content"] = glm45.accumulated_reasoning
            place["display_content"] = chunk
            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        place["content"] = f"Error: {str(e)}"
        place["display_content"] = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
        yield raw_hist, copy.deepcopy(raw_hist), ""


def reset():
    global stop_generation
    stop_generation = True
    # Give any in-flight stream a moment to notice the stop flag before clearing.
    time.sleep(0.1)
    return [], [], ""


def format_history_for_display(raw_hist):
    display_hist = []
    for msg in raw_hist:
        if msg["role"] == "user":
            display_hist.append({"role": "user", "content": msg["content"]})
        else:
            # Prefer the rendered HTML (with the thinking block) when available.
            content = msg.get("display_content", msg.get("content", ""))
            display_hist.append({"role": "assistant", "content": content})
    return display_hist


demo = gr.Blocks(title="GLM-4.5 API Demo", theme=gr.themes.Soft())

with demo:
    gr.HTML(
        "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.5 API Demo</div>"
        "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>"
        "This demo uses the API version of the service for faster responses.<br>"
        "Only chat functionality with a 64K token context is supported. For tool use, MCP support, and web search, please refer to the API documentation.</div>"
        "<div style='text-align:center;'><a href='https://modelscope.cn/collections/GLM-45-b8693e2a08984f'>Model</a> | "
        "<a href='https://github.com/zai-org/GLM-4.5'>Github</a> | "
        "<a href='http://z.ai/blog/glm-4.5'>Blog</a> | "
        "<a href='https://docs.bigmodel.cn/cn/guide/models/text/glm-4.5'>API Docs</a></div>"
    )

    raw_history = gr.State([])

    with gr.Row():
        with gr.Column(scale=7):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
            gr.HTML(
                "<div style='color:red;font-size:12px;margin-top:5px;margin-bottom:15px;'>"
                "Enabled: Activates the model's thinking capability. The model will decide whether to think based on the situation and may return empty thinking content.<br>"
                "Disabled: Disables the model's thinking capability. The model will answer questions directly without reasoning."
                "</div>"
            )
            temperature_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
                label="Temperature"
            )
            sys = gr.Textbox(label="System Prompt", lines=6)

    def chat_wrapper(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
        # Convert the raw history (which carries reasoning fields) into display messages for the Chatbot.
        for hist, raw_hist_updated, textbox_value in chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
            display_hist = format_history_for_display(hist)
            yield display_hist, raw_hist_updated, textbox_value

    send.click(
        chat_wrapper,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    textbox.submit(
        chat_wrapper,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    clear.click(
        reset,
        outputs=[chatbox, raw_history, textbox]
    )
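

# Local run sketch (assumption: the Space's actual launch setup may differ):
#   pip install gradio openai
#   export OPENAI_API_KEY=...   # key accepted by the GLM-4.5 endpoint
#   export OPENAI_BASE_URL=...  # OpenAI-compatible base URL (e.g. a vLLM server)
#   python app.py               # filename assumed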
if __name__ == "__main__":
    demo.queue(max_size=None, default_concurrency_limit=None)
    demo.launch()