import copy
import html
import re
import time

import gradio as gr
from openai import OpenAI

stop_generation = False


def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
    """Stream response deltas from an OpenAI-compatible endpoint.

    Despite the name, this uses the OpenAI client defaults; for a self-hosted
    vLLM server, point OPENAI_BASE_URL (and OPENAI_API_KEY) at that endpoint.
    """
    global stop_generation
    client = OpenAI()
    response = client.chat.completions.create(
        model="glm-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=32000,
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            }
        },
    )
    for chunk in response:
        if stop_generation:
            break
        if chunk.choices and chunk.choices[0].delta:
            yield chunk.choices[0].delta


class GLM45Model:
    def _strip_html(self, text: str) -> str:
        return re.sub(r"<[^>]+>", "", text).strip()

    def _wrap_text(self, text: str):
        return [{"type": "text", "text": text}]

    def _stream_fragment(self, reasoning_content: str = "", content: str = "", skip_think: bool = False):
        """Render the partial response as HTML: a collapsible thinking block
        followed by the answer."""
        think_html = ""
        if reasoning_content and not skip_think:
            think_content = html.escape(reasoning_content).replace("\n", "<br>")
            think_html = (
                "<details open><summary>💭 Thinking</summary>"
                "<div>"
                + think_content
                + "</div></details>"
            )
        answer_html = ""
        if content:
            content_escaped = html.escape(content)
            content_formatted = content_escaped.replace("\n", "<br>")
            answer_html = f"<div>{content_formatted}</div>"
        return think_html + answer_html

    def _build_messages(self, raw_hist, sys_prompt):
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": self._wrap_text(h["content"])})
            else:
                # Drop the collapsible thinking block and any remaining markup
                # before sending the assistant turn back to the API.
                raw = re.sub(r"<details.*?</details>", "", h["content"], flags=re.DOTALL)
                clean_content = self._strip_html(raw).strip()
                if clean_content:
                    msgs.append({"role": "assistant", "content": self._wrap_text(clean_content)})
        return msgs

    def stream_generate(self, raw_hist, sys_prompt: str, thinking_enabled: bool = True, temperature: float = 1.0):
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        reasoning_buffer = ""
        content_buffer = ""
        try:
            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
                if stop_generation:
                    break
                # Deltas may arrive as attribute objects or plain dicts
                # depending on the client version; handle both.
                if isinstance(delta, dict):
                    reasoning_buffer += delta.get("reasoning_content") or ""
                    content_buffer += delta.get("content") or ""
                else:
                    reasoning_buffer += getattr(delta, "reasoning_content", None) or ""
                    content_buffer += getattr(delta, "content", None) or ""
                yield self._stream_fragment(reasoning_buffer, content_buffer, not thinking_enabled)
        except Exception as e:
            yield self._stream_fragment("", f"Error during streaming: {e}")


glm45 = GLM45Model()


def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
    global stop_generation
    stop_generation = False
    if raw_hist is None:
        raw_hist = []
    if not msg.strip():
        # Yield (not return a value): chat is a generator, so a bare return
        # would end the stream without updating the UI.
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return
    raw_hist.append({"role": "user", "content": msg.strip()})
    place = {"role": "assistant", "content": ""}
    raw_hist.append(place)
    yield raw_hist, copy.deepcopy(raw_hist), ""
    try:
        for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
            if stop_generation:
                break
            place["content"] = chunk
            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        place["content"] = f"<div>Error: {html.escape(str(e))}</div>"
        yield raw_hist, copy.deepcopy(raw_hist), ""
    yield raw_hist, copy.deepcopy(raw_hist), ""


def reset():
    global stop_generation
    stop_generation = True
    time.sleep(0.1)  # give any in-flight stream a moment to notice the flag
    return [], [], ""


demo = gr.Blocks(title="GLM-4.5 API Space", theme=gr.themes.Soft())
with demo:
    gr.HTML(
        "<h1>GLM-4.5 API Space</h1>"
        "<p>This space uses the API version of the service for faster responses.<br>"
        "Chat only. For tool use, MCP support, and web search, please refer to the API.</p>"
        # Link targets for these labels were not preserved; shown as plain text.
        "<p>Model Hub | GitHub | API</p>"
    )
    raw_history = gr.State([])
    with gr.Row():
        with gr.Column(scale=7):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True,
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
" "ON: Enable model thinking.
" "OFF: Not enable model thinking, the model will directly answer the question without reasoning." "
" ) temperature_slider = gr.Slider( minimum=0.0, maximum=1.0, value=1.0, step=0.01, label="Temperature" ) sys = gr.Textbox(label="System Prompt", lines=6) send.click( chat, inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider], outputs=[chatbox, raw_history, textbox] ) textbox.submit( chat, inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider], outputs=[chatbox, raw_history, textbox] ) clear.click( reset, outputs=[chatbox, raw_history, textbox] ) if __name__ == "__main__": demo.launch()