Spaces:

zai-org
/

GLM-4.5-Space

Running

File size: 7,662 Bytes

import copy
import re
import time
import html
from openai import OpenAI
import gradio as gr

# Module-wide cancellation flag. reset() sets it to True; chat() and
# GLM45Model.stream_generate() set it back to False when a new turn starts,
# and the streaming loops check it between chunks to stop early.
# NOTE(review): this is one flag for the whole process, so it is presumably
# shared by every concurrent session — confirm that is acceptable.
stop_generation = False


def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
    """Stream chat-completion deltas for ``messages`` from the ``glm-4.5`` model.

    Opens a streaming chat completion through a default-constructed
    ``OpenAI()`` client and yields the first choice's ``delta`` of every
    chunk that carries one. Iteration stops early once the module-level
    ``stop_generation`` flag is set.

    Args:
        messages: Chat messages, forwarded unchanged to the API.
        thinking_enabled: Sent as ``extra_body["thinking"]["type"]``:
            ``"enabled"`` when true, ``"disabled"`` otherwise.
        temperature: Sampling temperature forwarded to the API.

    Yields:
        The ``delta`` object of ``chunk.choices[0]`` for each chunk that has
        a non-empty ``choices`` list and a truthy delta.
    """
    client = OpenAI()

    response = client.chat.completions.create(
        model="glm-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=32000,
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            },
        },
    )

    try:
        for chunk in response:
            if stop_generation:
                break

            if chunk.choices and chunk.choices[0].delta:
                yield chunk.choices[0].delta
    finally:
        # Reached on normal exhaustion, on the stop-flag break, and when the
        # consumer abandons this generator: release the HTTP stream in every
        # case instead of leaving the connection open.
        close = getattr(response, "close", None)
        if callable(close):
            close()


class GLM45Model:
    """Render streamed GLM-4.5 output as chat HTML and rebuild API messages from it.

    The class keeps no instance state; every method works only on its
    arguments (``stream_generate`` additionally resets and reads the
    module-level ``stop_generation`` flag).
    """

    def _strip_html(self, text: str) -> str:
        """Remove every ``<...>`` tag from ``text`` and trim surrounding whitespace."""
        return re.sub(r"<[^>]+>", "", text).strip()

    def _wrap_text(self, text: str):
        """Wrap ``text`` in the single-part ``[{"type": "text", ...}]`` content list."""
        return [{"type": "text", "text": text}]

    def _html_to_text(self, fragment: str) -> str:
        """Recover the plain answer text from HTML built by ``_stream_fragment``.

        The thinking ``<details>`` block is dropped, ``<br>`` is turned back
        into a newline, remaining tags are removed and HTML entities are
        decoded, so the model is sent the text it originally produced rather
        than its escaped, single-line rendering.
        """
        without_thinking = re.sub(r"<details.*?</details>", "", fragment, flags=re.DOTALL)
        with_newlines = re.sub(r"<br\s*/?>", "\n", without_thinking, flags=re.IGNORECASE)
        # Tags must go before unescaping, otherwise a literal "&lt;b&gt;" in
        # the answer would be decoded to "<b>" and then stripped as a tag.
        return html.unescape(self._strip_html(with_newlines)).strip()

    @staticmethod
    def _delta_text(delta, field: str) -> str:
        """Return ``delta``'s ``field`` as a string, or ``""`` when missing or empty.

        Accepts both attribute-style delta objects and plain dicts.
        """
        if isinstance(delta, dict):
            value = delta.get(field)
        else:
            value = getattr(delta, field, None)
        return value or ""

    def _stream_fragment(self, reasoning_content: str = "", content: str = "", skip_think: bool = False):
        """Build the HTML shown in the chat for the text accumulated so far.

        Args:
            reasoning_content: Accumulated reasoning text; rendered inside an
                open ``<details>`` block unless ``skip_think`` is true.
            content: Accumulated answer text; rendered inside a ``<div>``.
            skip_think: When true, the reasoning block is omitted.

        Returns:
            The reasoning HTML followed by the answer HTML. Both texts are
            HTML-escaped and have newlines replaced with ``<br>``; an empty
            part contributes nothing.
        """
        think_html = ""
        if reasoning_content and not skip_think:
            think_content = html.escape(reasoning_content).replace("\n", "<br>")
            think_html = (
                    "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>💭 Thinking</summary>"
                    "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
                    + think_content
                    + "</div></details>"
            )

        answer_html = ""
        if content:
            content_escaped = html.escape(content)
            content_formatted = content_escaped.replace("\n", "<br>")
            answer_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_formatted}</div>"

        return think_html + answer_html

    def _build_messages(self, raw_hist, sys_prompt):
        """Convert the chat history into the message list sent to the API.

        Args:
            raw_hist: Sequence of ``{"role", "content"}`` dicts. Entries whose
                role is not ``"user"`` are treated as assistant turns holding
                rendered HTML. ``None`` is treated as an empty history.
            sys_prompt: Optional system prompt; ``None`` or a blank string
                adds no system message.

        Returns:
            A list of messages whose ``content`` is a single text part.
            Assistant turns that are empty after conversion are skipped.
        """
        msgs = []
        system_text = (sys_prompt or "").strip()
        if system_text:
            msgs.append({"role": "system", "content": self._wrap_text(system_text)})
        for h in raw_hist or []:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": self._wrap_text(h["content"])})
            else:
                clean_content = self._html_to_text(h["content"])
                if clean_content:
                    msgs.append({"role": "assistant", "content": self._wrap_text(clean_content)})
        return msgs

    def stream_generate(self, raw_hist, sys_prompt: str, thinking_enabled: bool = True, temperature: float = 1.0):
        """Stream a reply for ``raw_hist`` as progressively growing HTML.

        Clears the module-level ``stop_generation`` flag, then consumes
        ``stream_from_vllm`` and accumulates reasoning and answer text.

        Args:
            raw_hist: Chat history, see ``_build_messages``.
            sys_prompt: Optional system prompt.
            thinking_enabled: Forwarded to ``stream_from_vllm``; when false
                the reasoning block is also left out of the rendered HTML.
            temperature: Sampling temperature forwarded to ``stream_from_vllm``.

        Yields:
            After every delta, the HTML for everything received so far. If
            streaming raises, one final fragment holding the error message is
            yielded instead of propagating the exception.
        """
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        reasoning_buffer = ""
        content_buffer = ""

        try:
            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
                if stop_generation:
                    break

                # Read both fields independently: a delta carrying reasoning
                # and answer text together must not lose the answer part.
                reasoning_buffer += self._delta_text(delta, "reasoning_content")
                content_buffer += self._delta_text(delta, "content")

                yield self._stream_fragment(reasoning_buffer, content_buffer, not thinking_enabled)

        except Exception as e:
            # Boundary handler: surface the failure in the chat window.
            error_msg = f"Error during streaming: {str(e)}"
            yield self._stream_fragment("", error_msg)


# Shared GLM45Model instance; chat() calls its stream_generate() to stream replies.
glm45 = GLM45Model()


def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
    """Handle one user message and stream the assistant reply to the UI.

    This is a generator: every yield is a
    ``(history, copy_of_history, "")`` triple. The empty string is the new
    value for the message box.

    Args:
        msg: Text typed by the user. ``None`` or whitespace-only input adds
            nothing to the history.
        raw_hist: List of ``{"role", "content"}`` dicts; extended in place.
            ``None`` is treated as an empty history.
        sys_prompt: System prompt forwarded to ``glm45.stream_generate``.
        thinking_enabled: Whether reasoning is requested and displayed.
        temperature: Sampling temperature forwarded to the model.

    Yields:
        The updated history after the user turn is appended, after each
        streamed chunk, and once more at the end unless generation was
        stopped through the ``stop_generation`` flag.
    """
    global stop_generation
    stop_generation = False

    if raw_hist is None:
        raw_hist = []

    text = (msg or "").strip()
    if not text:
        # A `return <value>` inside a generator emits nothing, so the current
        # state has to be yielded explicitly before stopping.
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return

    raw_hist.append({"role": "user", "content": text})
    # Placeholder assistant turn, overwritten in place as chunks arrive.
    place = {"role": "assistant", "content": ""}
    raw_hist.append(place)

    yield raw_hist, copy.deepcopy(raw_hist), ""

    try:
        # The placeholder is excluded from the history sent to the model.
        for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
            if stop_generation:
                break
            place["content"] = chunk
            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        error_content = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
        place["content"] = error_content
        yield raw_hist, copy.deepcopy(raw_hist), ""

    if stop_generation:
        # Stopped by reset(): yielding the old history here would write it
        # back over the outputs that reset() has just cleared.
        return

    yield raw_hist, copy.deepcopy(raw_hist), ""


def reset():
    """Request that streaming stop and return cleared values for the UI.

    Sets the module-level ``stop_generation`` flag, pauses for 0.1 seconds,
    then returns an empty list, another empty list and an empty string.
    """
    global stop_generation
    stop_generation = True

    # Short pause before handing back the cleared values.
    time.sleep(0.1)

    cleared_first, cleared_second, cleared_text = [], [], ""
    return cleared_first, cleared_second, cleared_text


# Gradio UI: a chat column (chatbot, message box, Send/Clear buttons) next to
# a narrow settings column (thinking toggle, temperature, system prompt).
demo = gr.Blocks(title="GLM-4.5 API Space", theme=gr.themes.Soft())

with demo:
    # Page banner: title, usage notice and external links.
    banner_html = "".join((
        "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.5 API Space</div>",
        "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>",
        "This space uses the API version of the service for faster response.<br>",
        "Chat only. For tool use, MCP support, and web search, please refer to the API.</div>",
        "<div style='text-align:center;'><a href='https://huggingface.co/THUDM/GLM-4.5'>Model Hub</a> | ",
        "<a href='https://github.com/THUDM/GLM-4.5'>Github</a> | ",
        "<a href='https://www.bigmodel.cn'>API</a></div>",
    ))
    gr.HTML(banner_html)

    # Conversation history state, passed to and returned from chat().
    raw_history = gr.State([])

    with gr.Row():
        with gr.Column(scale=7):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True,
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")

        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
            thinking_hint_html = "".join((
                "<div style='color:red;font-size:12px;margin-top:5px;margin-bottom:15px;'>",
                "ON: Enable model thinking.<br>",
                "OFF: Not enable model thinking, the model will directly answer the question without reasoning.",
                "</div>",
            ))
            gr.HTML(thinking_hint_html)
            temperature_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
                label="Temperature",
            )
            sys = gr.Textbox(label="System Prompt", lines=6)

    # The Send button and Enter in the message box trigger the same handler
    # with the same inputs and outputs.
    for submit_event in (send.click, textbox.submit):
        submit_event(
            chat,
            inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
            outputs=[chatbox, raw_history, textbox],
        )

    clear.click(reset, outputs=[chatbox, raw_history, textbox])

if __name__ == "__main__":
    demo.launch()