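# Gradio demo Space for chatting with GLM-4.5 through an OpenAI-compatible
# streaming endpoint, with optional rendering of the model's "thinking"
# trace above each answer.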
import copy
import html
import re
import time

import gradio as gr
from openai import OpenAI
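
# Module-level cancellation flag shared between the UI callbacks and the
# streaming generators. Because it is a single global, it is shared by all
# concurrent users of the Space: pressing "Clear" cancels every in-flight
# generation, not just the current session's.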
stop_generation = False
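

# Streams chat-completion deltas from an OpenAI-compatible server (e.g. vLLM).
# Assumes OPENAI_API_KEY (and OPENAI_BASE_URL, when not talking to
# api.openai.com) are set in the environment; the OpenAI() client reads both
# automatically. The non-standard "thinking" field is passed via extra_body,
# which the SDK forwards verbatim in the request body for the backend to use.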
def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
    global stop_generation
    client = OpenAI()
    response = client.chat.completions.create(
        model="glm-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=32000,
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            }
        },
    )
    for chunk in response:
        if stop_generation:
            break
        if chunk.choices and chunk.choices[0].delta:
            yield chunk.choices[0].delta
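

# Wraps prompt construction and streaming for the UI: builds OpenAI-style
# messages from the Gradio chat history, and renders streamed reasoning and
# answer text as HTML fragments for the Chatbot component.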
class GLM45Model:
    def _strip_html(self, text: str) -> str:
        return re.sub(r"<[^>]+>", "", text).strip()

    def _wrap_text(self, text: str):
        return [{"type": "text", "text": text}]
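
    # Renders the accumulated reasoning and answer text as one HTML fragment:
    # reasoning goes in a collapsible <details> block, the answer in a plain
    # <div>. Both are HTML-escaped first, so model output cannot inject markup.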
    def _stream_fragment(self, reasoning_content: str = "", content: str = "", skip_think: bool = False):
        think_html = ""
        if reasoning_content and not skip_think:
            think_content = html.escape(reasoning_content).replace("\n", "<br>")
            think_html = (
                "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>💭 Thinking</summary>"
                "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;"
                "margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
                + think_content
                + "</div></details>"
            )
        answer_html = ""
        if content:
            content_escaped = html.escape(content)
            content_formatted = content_escaped.replace("\n", "<br>")
            answer_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_formatted}</div>"
        return think_html + answer_html
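
    # Rebuilds the API message list from the UI history. Assistant turns are
    # stored as rendered HTML, so the <details> thinking block is stripped and
    # any remaining tags removed before the text is sent back as context.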
    def _build_messages(self, raw_hist, sys_prompt):
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": self._wrap_text(h["content"])})
            else:
                raw = re.sub(r"<details.*?</details>", "", h["content"], flags=re.DOTALL)
                clean_content = self._strip_html(raw).strip()
                if clean_content:
                    msgs.append({"role": "assistant", "content": self._wrap_text(clean_content)})
        return msgs
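
    # Generator used by the chat callback: accumulates reasoning_content and
    # content deltas separately and yields the full re-rendered HTML fragment
    # after every chunk, so Gradio shows a progressively growing message.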
    def stream_generate(self, raw_hist, sys_prompt: str, thinking_enabled: bool = True, temperature: float = 1.0):
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        reasoning_buffer = ""
        content_buffer = ""
        try:
            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
                if stop_generation:
                    break
                # Deltas normally arrive as objects with optional
                # reasoning_content / content attributes; some servers return
                # plain dicts instead, so handle both shapes.
                if hasattr(delta, "reasoning_content") and delta.reasoning_content:
                    reasoning_buffer += delta.reasoning_content
                elif hasattr(delta, "content") and delta.content:
                    content_buffer += delta.content
                elif isinstance(delta, dict):
                    if delta.get("reasoning_content"):
                        reasoning_buffer += delta["reasoning_content"]
                    if delta.get("content"):
                        content_buffer += delta["content"]
                yield self._stream_fragment(reasoning_buffer, content_buffer, not thinking_enabled)
        except Exception as e:
            yield self._stream_fragment("", f"Error during streaming: {e}")


glm45 = GLM45Model()
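

# Gradio callback. Yields (chatbot, state, textbox) triples so the assistant
# placeholder appended to the history is updated in place as tokens stream in;
# the textbox is cleared on the first yield.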
def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
    global stop_generation
    stop_generation = False
    if raw_hist is None:
        raw_hist = []
    # chat is a generator, so even the empty-message case must yield rather
    # than return a value.
    if not msg.strip():
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return
    raw_hist.append({"role": "user", "content": msg.strip()})
    place = {"role": "assistant", "content": ""}
    raw_hist.append(place)
    yield raw_hist, copy.deepcopy(raw_hist), ""
    try:
        # Pass the history without the empty placeholder as model context.
        for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
            if stop_generation:
                break
            place["content"] = chunk
            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        place["content"] = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
        yield raw_hist, copy.deepcopy(raw_hist), ""
    yield raw_hist, copy.deepcopy(raw_hist), ""
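

# "Clear" handler: raises the stop flag so running generators exit, sleeps
# briefly to let them notice, then empties the chat, the state, and the textbox.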
def reset():
    global stop_generation
    stop_generation = True
    time.sleep(0.1)
    return [], [], ""
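

# UI: chat panel on the left; thinking toggle, temperature, and system prompt
# on the right. sanitize_html=False on the Chatbot lets the inline-styled HTML
# fragments built above render as markup instead of being escaped.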
demo = gr.Blocks(title="GLM-4.5 API Space", theme=gr.themes.Soft())
with demo:
    gr.HTML(
        "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.5 API Space</div>"
        "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>"
        "This Space uses the API version of the service for faster responses.<br>"
        "Chat only. For tool use, MCP support, and web search, please refer to the API.</div>"
        "<div style='text-align:center;'><a href='https://huggingface.co/THUDM/GLM-4.5'>Model Hub</a> | "
        "<a href='https://github.com/THUDM/GLM-4.5'>GitHub</a> | "
        "<a href='https://www.bigmodel.cn'>API</a></div>"
    )
    raw_history = gr.State([])
    with gr.Row():
        with gr.Column(scale=7):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True,
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
            gr.HTML(
                "<div style='color:red;font-size:12px;margin-top:5px;margin-bottom:15px;'>"
                "ON: the model reasons before answering.<br>"
                "OFF: thinking is disabled and the model answers directly, without reasoning."
                "</div>"
            )
            temperature_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
                label="Temperature",
            )
            sys = gr.Textbox(label="System Prompt", lines=6)

    send.click(
        chat,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox],
    )
    textbox.submit(
        chat,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox],
    )
    clear.click(
        reset,
        outputs=[chatbox, raw_history, textbox],
    )

if __name__ == "__main__":
    demo.launch()