import copy
import re
import time
import html
from openai import OpenAI
import gradio as gr

# Module-level flag used to cooperatively stop an in-flight streaming generation.
stop_generation = False


def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
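    """Stream chat-completion deltas from an OpenAI-compatible GLM-4.5 endpoint.

    The OpenAI client picks up its API key and base URL from the environment
    (OPENAI_API_KEY / OPENAI_BASE_URL), so the same code can target a local
    vLLM server or a hosted endpoint. The `thinking` field in `extra_body` is a
    provider-specific extension that toggles the model's reasoning phase.
    """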
    global stop_generation
    client = OpenAI()

    response = client.chat.completions.create(
        model="glm-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=32000,
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            }
        }
    )

    # Forward deltas until the caller stops the generation or the stream ends.
    for chunk in response:
        if stop_generation:
            break

        if chunk.choices and chunk.choices[0].delta:
            delta = chunk.choices[0].delta
            yield delta


class GLM45Model:
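    """Builds API messages from Gradio chat history and renders streamed
    reasoning/answer text as HTML for the chatbot component."""
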
    def _strip_html(self, text: str) -> str:
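        """Remove any HTML tags from previously rendered assistant output."""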
        return re.sub(r"<[^>]+>", "", text).strip()

    def _wrap_text(self, text: str):
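        """Wrap plain text in the structured content format sent to the API."""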
        return [{"type": "text", "text": text}]

    def _stream_fragment(self, reasoning_content: str = "", content: str = "", skip_think: bool = False):
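        """Render accumulated reasoning and answer text as an HTML fragment.

        Reasoning goes into a collapsible <details> block unless `skip_think`
        is set (thinking disabled for this request).
        """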
        think_html = ""
        if reasoning_content and not skip_think:
            think_content = html.escape(reasoning_content).replace("\n", "<br>")
            think_html = (
                    "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>💭 Thinking</summary>"
                    "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
                    + think_content
                    + "</div></details>"
            )

        answer_html = ""
        if content:
            content_escaped = html.escape(content)
            content_formatted = content_escaped.replace("\n", "<br>")
            answer_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_formatted}</div>"

        return think_html + answer_html

    def _build_messages(self, raw_hist, sys_prompt):
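        """Convert Gradio-style history into API messages.

        Prior assistant turns are stored as rendered HTML, so the thinking
        <details> block is dropped and remaining tags are stripped before the
        text is sent back to the model.
        """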
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": self._wrap_text(h["content"])})
            else:
                raw = re.sub(r"<details.*?</details>", "", h["content"], flags=re.DOTALL)
                clean_content = self._strip_html(raw).strip()
                if clean_content:
                    msgs.append({"role": "assistant", "content": self._wrap_text(clean_content)})
        return msgs

    def stream_generate(self, raw_hist, sys_prompt: str, thinking_enabled: bool = True, temperature: float = 1.0):
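        """Stream a reply for the given history, yielding the growing HTML
        fragment (thinking block plus answer) after every received delta."""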
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        reasoning_buffer = ""
        content_buffer = ""

        try:
            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
                if stop_generation:
                    break

                # A delta may carry reasoning text, answer text, or both in the
                # same chunk; some backends return plain dicts instead of SDK
                # objects, so handle both shapes.
                if isinstance(delta, dict):
                    reasoning_buffer += delta.get('reasoning_content') or ""
                    content_buffer += delta.get('content') or ""
                else:
                    if getattr(delta, 'reasoning_content', None):
                        reasoning_buffer += delta.reasoning_content
                    if getattr(delta, 'content', None):
                        content_buffer += delta.content

                yield self._stream_fragment(reasoning_buffer, content_buffer, not thinking_enabled)

        except Exception as e:
            error_msg = f"Error during streaming: {str(e)}"
            yield self._stream_fragment("", error_msg)


glm45 = GLM45Model()


def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
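    """Gradio event handler for a user message.

    Appends the user turn plus an empty assistant placeholder, then streams the
    assistant's reply into that placeholder, yielding updated
    (chatbot, state, textbox) values after every chunk.
    """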
    global stop_generation
    stop_generation = False

    if raw_hist is None:
        raw_hist = []

    # `chat` is a generator, so a bare `return value` would never reach the UI;
    # yield the unchanged state instead when the message is empty.
    if not msg.strip():
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return

    user_rec = {"role": "user", "content": msg.strip()}
    raw_hist.append(user_rec)

    # Placeholder assistant turn; its content is overwritten as chunks stream in.
    place = {"role": "assistant", "content": ""}
    raw_hist.append(place)

    yield raw_hist, copy.deepcopy(raw_hist), ""

    try:
        for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
            if stop_generation:
                break
            place["content"] = chunk
            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        error_content = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
        place["content"] = error_content
        yield raw_hist, copy.deepcopy(raw_hist), ""

    yield raw_hist, copy.deepcopy(raw_hist), ""


def reset():
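    """Stop any in-flight generation and clear the chat, state, and textbox.

    The short sleep gives the streaming loop a moment to observe the stop flag
    before the UI state is reset.
    """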
    global stop_generation
    stop_generation = True
    time.sleep(0.1)
    return [], [], ""


demo = gr.Blocks(title="GLM-4.5 API Space", theme=gr.themes.Soft())

with demo:
    gr.HTML(
        "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.5 API Space</div>"
        "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>"
        "This space uses the API version of the service for faster response.<br>"
        "Chat only. For tool use, MCP support, and web search, please refer to the API.</div>"
        "<div style='text-align:center;'><a href='https://huggingface.co/THUDM/GLM-4.5'>Model Hub</a> | "
        "<a href='https://github.com/THUDM/GLM-4.5'>Github</a> | "
        "<a href='https://www.bigmodel.cn'>API</a></div>"
    )
    raw_history = gr.State([])

    with gr.Row():
        with gr.Column(scale=7):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
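                # Disable sanitization so the raw HTML fragments produced by
                # GLM45Model (thinking <details> block, styled answer) render.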
                sanitize_html=False,
                line_breaks=True
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
            gr.HTML(
                "<div style='color:red;font-size:12px;margin-top:5px;margin-bottom:15px;'>"
                "ON: Enable model thinking.<br>"
                "OFF: Not enable model thinking, the model will directly answer the question without reasoning."
                "</div>"
            )
            temperature_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
                label="Temperature"
            )
            sys = gr.Textbox(label="System Prompt", lines=6)

    send.click(
        chat,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    textbox.submit(
        chat,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    clear.click(
        reset,
        outputs=[chatbox, raw_history, textbox]
    )

if __name__ == "__main__":
    demo.launch()