import copy
import time
import html
from openai import OpenAI
import gradio as gr

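# Module-level flag used to cooperatively stop an in-flight streaming generation.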
stop_generation = False


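# Stream chat-completion deltas from an OpenAI-compatible endpoint (e.g. vLLM).
# The OpenAI client is assumed to be configured via environment variables
# (OPENAI_API_KEY / OPENAI_BASE_URL).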
def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
    global stop_generation
    client = OpenAI()
    response = client.chat.completions.create(
        model="GLM-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=65536,
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            }
        }
    )
    for chunk in response:
        if stop_generation:
            break
        if chunk.choices and chunk.choices[0].delta:
            yield chunk.choices[0].delta


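# Accumulates streamed reasoning/content and renders them as HTML for the chat UI.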
class GLM45Model:
    def __init__(self):
        self.accumulated_content = ""
        self.accumulated_reasoning = ""

    def reset_state(self):
        self.accumulated_content = ""
        self.accumulated_reasoning = ""

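    # Escape both parts and wrap the reasoning trace in a collapsible <details> block.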
    def _render_response(self, reasoning_content, regular_content, skip_think=False):
        html_parts = []

        if reasoning_content and not skip_think:
            reasoning_escaped = html.escape(reasoning_content).replace("\n", "<br>")
            think_html = (
                    "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>Thinking</summary>"
                    "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
                    + reasoning_escaped +
                    "</div></details>"
            )
            html_parts.append(think_html)

        if regular_content:
            content_escaped = html.escape(regular_content).replace("\n", "<br>")
            content_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_escaped}</div>"
            html_parts.append(content_html)

        return "".join(html_parts)

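    # Convert the raw history (plus optional system prompt) into OpenAI-style
    # messages, preserving reasoning_content on assistant turns.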
    def _build_messages(self, raw_hist, sys_prompt):
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": sys_prompt.strip()})

        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": h["content"]})
            else:
                msg = {"role": "assistant", "content": h.get("content", "")}
                if h.get("reasoning_content"):
                    msg["reasoning_content"] = h.get("reasoning_content")
                msgs.append(msg)
        return msgs

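    # Stream the model response, yielding a progressively updated HTML rendering.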
    def stream_generate(self, raw_hist, sys_prompt, thinking_enabled=True, temperature=1.0):
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        self.reset_state()

        try:
            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
                if stop_generation:
                    break

                if hasattr(delta, 'content') and delta.content:
                    self.accumulated_content += delta.content

                if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
                    self.accumulated_reasoning += delta.reasoning_content

                yield self._render_response(self.accumulated_reasoning, self.accumulated_content, not thinking_enabled)

        except Exception as e:
            yield self._render_response("", f"Error: {str(e)}")


glm45 = GLM45Model()


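# Generator used by the UI: appends the user turn plus an assistant placeholder,
# then yields (history, state, textbox) triples as the response streams in.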
def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
    global stop_generation
    stop_generation = False

    if raw_hist is None:
        raw_hist = []

    # chat() is a generator, so the empty-message case must yield the unchanged
    # state (a bare `return value` inside a generator never reaches the UI).
    if not msg.strip():
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return

    raw_hist.append({"role": "user", "content": msg.strip()})
    place = {
        "role": "assistant",
        "content": "",
        "reasoning_content": ""
    }
    raw_hist.append(place)

    yield raw_hist, copy.deepcopy(raw_hist), ""

    try:
        for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
            if stop_generation:
                break

            place["content"] = glm45.accumulated_content
            place["reasoning_content"] = glm45.accumulated_reasoning
            place["display_content"] = chunk

            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        place["content"] = f"Error: {str(e)}"
        place["display_content"] = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
        yield raw_hist, copy.deepcopy(raw_hist), ""


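# Stop any in-flight generation and clear the chat, state, and textbox.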
def reset():
    global stop_generation
    stop_generation = True
    time.sleep(0.1)
    return [], [], ""


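# Map the raw history to Chatbot messages, preferring the rendered HTML
# ("display_content") for assistant turns.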
def format_history_for_display(raw_hist):
    display_hist = []
    for msg in raw_hist:
        if msg["role"] == "user":
            display_hist.append({"role": "user", "content": msg["content"]})
        else:
            content = msg.get("display_content", msg.get("content", ""))
            display_hist.append({"role": "assistant", "content": content})
    return display_hist


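# Gradio UI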
demo = gr.Blocks(title="GLM-4.5 API Demo", theme=gr.themes.Soft())

with demo:
    gr.HTML(
        "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.5 API Demo</div>"
        "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>"
        "This demo uses the API version of the service for faster response speeds.<br>"
        "Only chat functionality with 64K token length is supported. For tool usage, MCP support, and web search, please refer to the API documentation.</div>"
        "<div style='text-align:center;'><a href='https://modelscope.cn/collections/GLM-45-b8693e2a08984f'>Model</a> | "
        "<a href='https://github.com/zai-org/GLM-4.5'>Github</a> | "
        "<a href='http://z.ai/blog/glm-4.5'>Blog</a> | "
        "<a href='https://docs.bigmodel.cn/cn/guide/models/text/glm-4.5'>API Docs</a></div>"
    )

    raw_history = gr.State([])

    with gr.Row():
        with gr.Column(scale=7):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
            gr.HTML(
                "<div style='color:red;font-size:12px;margin-top:5px;margin-bottom:15px;'>"
                "Enabled: Activates the model's thinking capability. The model will decide whether to think based on the situation and may return empty thinking content.<br>"
                "Disabled: Disables the model's thinking capability. The model will answer questions directly without reasoning."
                "</div>"
            )
            temperature_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
                label="Temperature"
            )
            sys = gr.Textbox(label="System Prompt", lines=6)


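    # Bridge between the UI and chat(): converts the raw history into display
    # messages on every yield.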
    def chat_wrapper(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
        for hist, raw_hist_updated, textbox_value in chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
            display_hist = format_history_for_display(hist)
            yield display_hist, raw_hist_updated, textbox_value


    send.click(
        chat_wrapper,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    textbox.submit(
        chat_wrapper,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    clear.click(
        reset,
        outputs=[chatbox, raw_history, textbox]
    )

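# Unbounded queue size and concurrency so multiple streaming sessions can run at once.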
if __name__ == "__main__":
    demo.queue(max_size=None, default_concurrency_limit=None)
    demo.launch()