Spaces:

zai-org
/

GLM-4.5-Space

Running

File size: 8,206 Bytes

67199da
 
 
 
 
 
 
 
 
 
 
 
 
cfdacf5
67199da
 
 
ecbee24
67199da
325c020
 
 
67199da
 
 
 
 
 
325c020
67199da
 
 
9ec8fec
325c020
 
9ec8fec
 
325c020
 
9ec8fec
325c020
9ec8fec
 
325c020
 
67199da
bba4030
67199da
325c020
9ec8fec
67199da
9ec8fec
67199da
9ec8fec
325c020
 
9ec8fec
67199da
9ec8fec
67199da
 
 
 
bba4030
325c020
67199da
 
bba4030
67199da
bba4030
 
 
 
67199da
 
325c020
67199da
 
 
9ec8fec
67199da
 
 
 
 
 
9ec8fec
325c020
 
 
 
9ec8fec
325c020
67199da
 
325c020
67199da
 
 
 
 
 
 
 
 
 
 
 
 
 
325c020
 
bba4030
 
 
 
 
67199da
 
 
 
 
 
 
 
bba4030
 
 
 
 
67199da
 
bba4030
 
67199da
 
 
 
 
 
 
 
 
 
bba4030
 
 
 
 
 
 
 
 
 
 
cfdacf5
67199da
 
26295fb
cfdacf5
67199da
325c020
bba4030
 
cfdacf5
aa0c384
325c020
67199da
325c020
67199da
 
 
445847a
67199da
 
 
 
 
 
 
 
 
 
 
 
 
 
26295fb
67199da
325c020
 
67199da
 
 
 
 
 
 
 
 
445847a
67199da
bba4030
 
 
 
 
 
 
67199da
bba4030
67199da
 
 
 
bba4030
67199da
 
 
 
 
 
 
 
 
103cf37
445847a

import copy
import time
import html
from openai import OpenAI
import gradio as gr

# Module-level stop flag. reset() sets it to True to ask in-flight streaming
# loops to break; chat() and GLM45Model.stream_generate() set it back to False
# when a new request starts. It is a single global, not per-session state.
stop_generation = False


def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
    """Stream a GLM-4.5 chat completion and yield each non-empty delta.

    Args:
        messages: List of chat messages passed unchanged as ``messages`` to
            ``client.chat.completions.create``.
        thinking_enabled: Selects ``"enabled"`` or ``"disabled"`` for the
            ``thinking.type`` field sent through ``extra_body``.
        temperature: Sampling temperature forwarded to the API.

    Yields:
        ``chunk.choices[0].delta`` for every streamed chunk that has at least
        one choice and a truthy delta.

    The loop stops early once the module-level ``stop_generation`` flag is set.
    """
    # The client is built with no arguments, so its configuration has to come
    # from the client's own defaults / environment.
    client = OpenAI()
    response = client.chat.completions.create(
        model="GLM-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=65536,
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            }
        }
    )
    try:
        for chunk in response:
            if stop_generation:
                break
            if chunk.choices and chunk.choices[0].delta:
                yield chunk.choices[0].delta
    finally:
        # Release the HTTP stream even when we break out early or the consumer
        # closes this generator before the stream is exhausted.
        response.close()


class GLM45Model:
    """Accumulates a streamed GLM-4.5 reply and renders it as an HTML snippet."""

    def __init__(self):
        """Start with empty answer and reasoning buffers."""
        self.accumulated_content = ""
        self.accumulated_reasoning = ""

    def reset_state(self):
        """Empty both accumulation buffers."""
        self.accumulated_reasoning = ""
        self.accumulated_content = ""

    def _render_response(self, reasoning_content, regular_content, skip_think=False):
        """Return HTML for the reasoning (optional) followed by the answer.

        Both texts are HTML-escaped and their newlines turned into ``<br>``.
        The reasoning section is emitted only when it is non-empty and
        ``skip_think`` is false; the answer section only when it is non-empty.
        An empty string is returned when neither section applies.
        """
        fragments = []
        show_thinking = bool(reasoning_content) and not skip_think

        if show_thinking:
            thinking_body = html.escape(reasoning_content).replace("\n", "<br>")
            fragments.append(
                "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>Thinking</summary>"
                "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
                f"{thinking_body}"
                "</div></details>"
            )

        if regular_content:
            answer_body = html.escape(regular_content).replace("\n", "<br>")
            fragments.append(
                "<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>"
                + answer_body
                + "</div>"
            )

        return "".join(fragments)

    def _build_messages(self, raw_hist, sys_prompt):
        """Convert the raw history into the message list sent to the API.

        A system message is prepended when ``sys_prompt`` is non-blank. User
        entries keep only role and content; every other entry becomes an
        assistant message, carrying ``reasoning_content`` only when it is
        truthy. Extra keys such as ``display_content`` are not forwarded.
        """
        system_text = sys_prompt.strip()
        api_messages = [{"role": "system", "content": system_text}] if system_text else []

        for entry in raw_hist:
            if entry["role"] == "user":
                api_messages.append({"role": "user", "content": entry["content"]})
                continue

            reply = {"role": "assistant", "content": entry.get("content", "")}
            reasoning = entry.get("reasoning_content")
            if reasoning:
                reply["reasoning_content"] = reasoning
            api_messages.append(reply)

        return api_messages

    def stream_generate(self, raw_hist, sys_prompt, thinking_enabled=True, temperature=1.0):
        """Stream a reply, yielding the re-rendered HTML after every delta.

        Clears the module-level stop flag and this instance's buffers, then
        consumes ``stream_from_vllm``. Each delta's ``content`` and
        ``reasoning_content`` (when present and truthy) are appended to the
        buffers. Any exception is caught and yielded as a rendered
        ``Error: ...`` answer instead of propagating.
        """
        global stop_generation
        stop_generation = False
        api_messages = self._build_messages(raw_hist, sys_prompt)
        self.reset_state()
        # With thinking disabled the reasoning section is never displayed.
        hide_thinking = not thinking_enabled

        try:
            for delta in stream_from_vllm(api_messages, thinking_enabled, temperature):
                if stop_generation:
                    break

                text_piece = getattr(delta, "content", None)
                if text_piece:
                    self.accumulated_content += text_piece

                reasoning_piece = getattr(delta, "reasoning_content", None)
                if reasoning_piece:
                    self.accumulated_reasoning += reasoning_piece

                yield self._render_response(
                    self.accumulated_reasoning,
                    self.accumulated_content,
                    hide_thinking,
                )

        except Exception as e:
            yield self._render_response("", "Error: " + str(e))


# Module-level GLM45Model instance, created once at import time.
glm45 = GLM45Model()


def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
    """Handle one user turn, yielding the updated history as the reply streams.

    Args:
        msg: Text typed by the user; surrounding whitespace is stripped.
        raw_hist: List of message dicts for the conversation so far, or None.
            It is mutated in place.
        sys_prompt: System prompt text forwarded to the model.
        thinking_enabled: Whether the model's thinking mode is requested.
        temperature: Sampling temperature forwarded to the model.

    Yields:
        ``(raw_hist, deep copy of raw_hist, "")`` tuples: once right after the
        user message and an empty assistant placeholder are appended, then once
        per streamed update. For a blank ``msg`` a single tuple with the
        unchanged history is yielded.
    """
    global stop_generation
    stop_generation = False

    # Normalise first so the blank-message path never hands None downstream.
    if raw_hist is None:
        raw_hist = []

    if not msg.strip():
        # This function is a generator: `return <value>` would emit nothing,
        # so the unchanged state has to be yielded explicitly.
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return

    raw_hist.append({"role": "user", "content": msg.strip()})
    place = {
        "role": "assistant",
        "content": "",
        "reasoning_content": ""
    }
    raw_hist.append(place)

    yield raw_hist, copy.deepcopy(raw_hist), ""

    # Use a per-request accumulator. A single shared instance would have its
    # buffers reset and appended to by every concurrent request, mixing replies.
    # NOTE: stop_generation is still one module-wide flag shared by all requests.
    model = GLM45Model()

    try:
        # The trailing placeholder is excluded from what is sent to the model.
        for chunk in model.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
            if stop_generation:
                break

            place["content"] = model.accumulated_content
            place["reasoning_content"] = model.accumulated_reasoning
            # Rendered HTML is stored separately from the raw text.
            place["display_content"] = chunk

            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        place["content"] = f"Error: {str(e)}"
        place["display_content"] = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
        yield raw_hist, copy.deepcopy(raw_hist), ""


def reset():
    """Request that streaming stop and return cleared UI values.

    Sets the module-level ``stop_generation`` flag, pauses for 0.1 seconds,
    and returns an empty list, another empty list and an empty string.
    """
    global stop_generation
    stop_generation = True
    # Short pause before handing back the cleared values.
    time.sleep(0.1)
    cleared_chat, cleared_history, cleared_text = [], [], ""
    return cleared_chat, cleared_history, cleared_text


def format_history_for_display(raw_hist):
    """Build the role/content list shown in the chat widget from raw history.

    User entries keep their content. Every other entry is emitted with role
    ``"assistant"`` and uses ``display_content`` when that key is present,
    otherwise ``content``, otherwise an empty string.
    """

    def _to_display(entry):
        # Only the two keys used for display are carried over.
        if entry["role"] == "user":
            return {"role": "user", "content": entry["content"]}
        fallback = entry.get("content", "")
        return {"role": "assistant", "content": entry.get("display_content", fallback)}

    return [_to_display(entry) for entry in raw_hist]


# Top-level Gradio app. Components created inside the `with demo:` block below
# are attached to it in the order they are declared.
demo = gr.Blocks(title="GLM-4.5 API Demo", theme=gr.themes.Soft())

with demo:
    # Static page header: title, usage notice and external links.
    gr.HTML(
        "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.5 API Demo</div>"
        "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>"
        "This demo uses the API version of the service for faster response speeds.<br>"
        "Only chat functionality with 64K token length is supported. For tool usage, MCP support, and web search, please refer to the API documentation.</div>"
        "<div style='text-align:center;'><a href='https://modelscope.cn/collections/GLM-45-b8693e2a08984f'>Model</a> | "
        "<a href='https://github.com/zai-org/GLM-4.5'>Github</a> | "
        "<a href='http://z.ai/blog/glm-4.5'>Blog</a> | "
        "<a href='https://docs.bigmodel.cn/cn/guide/models/text/glm-4.5'>API Docs</a></div>"
    )

    # Raw conversation history (message dicts including reasoning and rendered
    # HTML); kept separately from what the Chatbot displays.
    raw_history = gr.State([])

    with gr.Row():
        # Left column: chat window, message box and action buttons.
        with gr.Column(scale=7):
            # sanitize_html=False because assistant messages are pre-rendered
            # HTML produced by GLM45Model._render_response.
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        # Right column: generation settings.
        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
            gr.HTML(
                "<div style='color:red;font-size:12px;margin-top:5px;margin-bottom:15px;'>"
                "Enabled: Activates the model's thinking capability. The model will decide whether to think based on the situation and may return empty thinking content.<br>"
                "Disabled: Disables the model's thinking capability. The model will answer questions directly without reasoning."
                "</div>"
            )
            temperature_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
                label="Temperature"
            )
            # NOTE: `sys` here is the System Prompt textbox, not the standard
            # library module of the same name.
            sys = gr.Textbox(label="System Prompt", lines=6)


    # Adapter between chat() and the UI outputs: the raw history yielded by
    # chat() is converted with format_history_for_display() for the Chatbot,
    # while the copied history and the textbox value pass through unchanged.
    def chat_wrapper(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
        for hist, raw_hist_updated, textbox_value in chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
            display_hist = format_history_for_display(hist)
            yield display_hist, raw_hist_updated, textbox_value


    # Clicking Send and pressing Enter in the textbox trigger the same handler
    # with the same inputs and outputs.
    send.click(
        chat_wrapper,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    textbox.submit(
        chat_wrapper,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    # Clear takes no inputs; reset() returns empty values for all three outputs.
    clear.click(
        reset,
        outputs=[chatbox, raw_history, textbox]
    )

if __name__ == "__main__":
    # NOTE(review): None for both arguments presumably means no queue-size cap
    # and no concurrency cap -- confirm against the installed Gradio version.
    demo.queue(max_size=None, default_concurrency_limit=None)
    demo.launch()