import copy
import re
import time
import html
from openai import OpenAI
import gradio as gr
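# Note: OpenAI() reads OPENAI_API_KEY (and OPENAI_BASE_URL, in current
# openai-python releases) from the environment; this script assumes they point
# at an OpenAI-compatible endpoint that serves a "glm-4.5" model.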
stop_generation = False  # module-level stop flag (shared across sessions; fine for a single-user demo)
def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
    global stop_generation
    client = OpenAI()
    response = client.chat.completions.create(
        model="glm-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=32000,
        # Provider-specific field: toggles the model's "thinking" (reasoning) phase.
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            }
        },
    )
    for chunk in response:
        if stop_generation:
            break
        if chunk.choices and chunk.choices[0].delta:
            yield chunk.choices[0].delta
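# Standalone usage sketch (hypothetical; the Gradio app below is the real
# entry point): print answer tokens as they arrive.
#
#   for delta in stream_from_vllm([{"role": "user", "content": "Hello"}]):
#       print(getattr(delta, "content", "") or "", end="", flush=True)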
class GLM45Model:
    def _strip_html(self, text: str) -> str:
        return re.sub(r"<[^>]+>", "", text).strip()

    def _wrap_text(self, text: str):
        return [{"type": "text", "text": text}]
    def _stream_fragment(self, reasoning_content: str = "", content: str = "", skip_think: bool = False):
        # Render the reasoning trace as a collapsible <details> block and the
        # answer as a plain <div>; the Chatbot below is created with
        # sanitize_html=False, so these fragments render as HTML.
        think_html = ""
        if reasoning_content and not skip_think:
            think_content = html.escape(reasoning_content).replace("\n", "<br>")
            think_html = (
                "<details open><summary>💠 Thinking</summary>"
                "<div>"
                + think_content
                + "</div></details>"
            )
        answer_html = ""
        if content:
            content_escaped = html.escape(content)
            content_formatted = content_escaped.replace("\n", "<br>")
            answer_html = f"<div>{content_formatted}</div>"
        return think_html + answer_html
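    # With the minimal markup above, _stream_fragment("step 1", "done") yields:
    #   '<details open><summary>💠 Thinking</summary><div>step 1</div></details><div>done</div>'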
    def _build_messages(self, raw_hist, sys_prompt):
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": self._wrap_text(h["content"])})
            else:
                # Drop the rendered thinking block, then strip any remaining
                # HTML so only the plain-text answer is sent back to the model.
                raw = re.sub(r"<details.*?</details>", "", h["content"], flags=re.DOTALL)
                clean_content = self._strip_html(raw).strip()
                if clean_content:
                    msgs.append({"role": "assistant", "content": self._wrap_text(clean_content)})
        return msgs
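    # _build_messages emits the list-of-parts content format, e.g.:
    #   [{"role": "system", "content": [{"type": "text", "text": "..."}]},
    #    {"role": "user", "content": [{"type": "text", "text": "Hi"}]}]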
    def stream_generate(self, raw_hist, sys_prompt: str, thinking_enabled: bool = True, temperature: float = 1.0):
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        reasoning_buffer = ""
        content_buffer = ""
        try:
            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
                if stop_generation:
                    break
                # Deltas may arrive as attribute objects or plain dicts
                # depending on the client version; handle both.
                if isinstance(delta, dict):
                    reasoning_buffer += delta.get("reasoning_content") or ""
                    content_buffer += delta.get("content") or ""
                else:
                    if getattr(delta, "reasoning_content", None):
                        reasoning_buffer += delta.reasoning_content
                    if getattr(delta, "content", None):
                        content_buffer += delta.content
                yield self._stream_fragment(reasoning_buffer, content_buffer, not thinking_enabled)
        except Exception as e:
            yield self._stream_fragment("", f"Error during streaming: {e}")
glm45 = GLM45Model()
def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
    global stop_generation
    stop_generation = False
    if raw_hist is None:
        raw_hist = []
    if not msg.strip():
        # Empty input: re-emit the current state unchanged.
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return
    raw_hist.append({"role": "user", "content": msg.strip()})
    place = {"role": "assistant", "content": ""}
    raw_hist.append(place)
    yield raw_hist, copy.deepcopy(raw_hist), ""
    try:
        # Stream into the assistant placeholder; exclude it from the prompt.
        for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
            if stop_generation:
                break
            place["content"] = chunk
            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        place["content"] = f"<div>Error: {html.escape(str(e))}</div>"
        yield raw_hist, copy.deepcopy(raw_hist), ""
    yield raw_hist, copy.deepcopy(raw_hist), ""
def reset():
    # Signal any in-flight stream to stop, give the loop a moment to observe
    # the flag, then clear chat, state, and input box.
    global stop_generation
    stop_generation = True
    time.sleep(0.1)
    return [], [], ""
demo = gr.Blocks(title="GLM-4.5 API Space", theme=gr.themes.Soft())
with demo:
    gr.HTML(
        "<div style='text-align:center;'>"
        "<h1>GLM-4.5 API Space</h1>"
        "<p>This space uses the API version of the service for faster responses.</p>"
        "<p>Chat only. For tool use, MCP support, and web search, please refer to the API.</p>"
        "</div>"
    )
    raw_history = gr.State([])
    with gr.Row():
        with gr.Column(scale=7):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True,
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
            gr.HTML(
                "<p>"
                "<b>ON:</b> enable the model's thinking phase.<br>"
                "<b>OFF:</b> disable thinking; the model answers directly without reasoning."
                "</p>"
            )
            temperature_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
                label="Temperature",
            )
            sys = gr.Textbox(label="System Prompt", lines=6)

    send.click(
        chat,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox],
    )
    textbox.submit(
        chat,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox],
    )
    clear.click(
        reset,
        outputs=[chatbox, raw_history, textbox],
    )
if __name__ == "__main__":
    demo.launch()
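    # On Gradio 4+, queueing is enabled by default, which generator-based
    # streaming relies on; on older releases, call demo.queue() before launch().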