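# Gradio demo Space for chatting with GLM-4.5 through an OpenAI-compatible
# streaming endpoint, with optional rendering of the model's "thinking"
# trace above each answer.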
import copy
import html
import re
import time

import gradio as gr
from openai import OpenAI
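
# Module-level cancellation flag shared between the UI callbacks and the
# streaming generators. Because it is a single global, it is shared by all
# concurrent users of the Space: pressing "Clear" cancels every in-flight
# generation, not just the current session's.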
stop_generation = False
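

# Streams chat-completion deltas from an OpenAI-compatible server (e.g. vLLM).
# Assumes OPENAI_API_KEY (and OPENAI_BASE_URL, when not talking to
# api.openai.com) are set in the environment; the OpenAI() client reads both
# automatically. The non-standard "thinking" field is passed via extra_body,
# which the SDK forwards verbatim in the request body for the backend to use.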
def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
    global stop_generation
    client = OpenAI()
    response = client.chat.completions.create(
        model="glm-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=32000,
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            }
        },
    )
    for chunk in response:
        if stop_generation:
            break
        if chunk.choices and chunk.choices[0].delta:
            yield chunk.choices[0].delta
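

# Wraps prompt construction and streaming for the UI: builds OpenAI-style
# messages from the Gradio chat history, and renders streamed reasoning and
# answer text as HTML fragments for the Chatbot component.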
class GLM45Model:
    def _strip_html(self, text: str) -> str:
        return re.sub(r"<[^>]+>", "", text).strip()

    def _wrap_text(self, text: str):
        return [{"type": "text", "text": text}]
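
    # Renders the accumulated reasoning and answer text as one HTML fragment:
    # reasoning goes in a collapsible <details> block, the answer in a plain
    # <div>. Both are HTML-escaped first, so model output cannot inject markup.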
    def _stream_fragment(self, reasoning_content: str = "", content: str = "", skip_think: bool = False):
        think_html = ""
        if reasoning_content and not skip_think:
            think_content = html.escape(reasoning_content).replace("\n", "<br>")
            think_html = (
                "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>💭 Thinking</summary>"
                "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;"
                "margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
                + think_content
                + "</div></details>"
            )
        answer_html = ""
        if content:
            content_escaped = html.escape(content)
            content_formatted = content_escaped.replace("\n", "<br>")
            answer_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_formatted}</div>"
        return think_html + answer_html
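
    # Rebuilds the API message list from the UI history. Assistant turns are
    # stored as rendered HTML, so the <details> thinking block is stripped and
    # any remaining tags removed before the text is sent back as context.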
    def _build_messages(self, raw_hist, sys_prompt):
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": self._wrap_text(h["content"])})
            else:
                raw = re.sub(r"<details.*?</details>", "", h["content"], flags=re.DOTALL)
                clean_content = self._strip_html(raw).strip()
                if clean_content:
                    msgs.append({"role": "assistant", "content": self._wrap_text(clean_content)})
        return msgs
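
    # Generator used by the chat callback: accumulates reasoning_content and
    # content deltas separately and yields the full re-rendered HTML fragment
    # after every chunk, so Gradio shows a progressively growing message.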
    def stream_generate(self, raw_hist, sys_prompt: str, thinking_enabled: bool = True, temperature: float = 1.0):
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        reasoning_buffer = ""
        content_buffer = ""
        try:
            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
                if stop_generation:
                    break
                # Deltas normally arrive as objects with optional
                # reasoning_content / content attributes; some servers return
                # plain dicts instead, so handle both shapes.
                if hasattr(delta, "reasoning_content") and delta.reasoning_content:
                    reasoning_buffer += delta.reasoning_content
                elif hasattr(delta, "content") and delta.content:
                    content_buffer += delta.content
                elif isinstance(delta, dict):
                    if delta.get("reasoning_content"):
                        reasoning_buffer += delta["reasoning_content"]
                    if delta.get("content"):
                        content_buffer += delta["content"]
                yield self._stream_fragment(reasoning_buffer, content_buffer, not thinking_enabled)
        except Exception as e:
            yield self._stream_fragment("", f"Error during streaming: {e}")


glm45 = GLM45Model()
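

# Gradio callback. Yields (chatbot, state, textbox) triples so the assistant
# placeholder appended to the history is updated in place as tokens stream in;
# the textbox is cleared on the first yield.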
def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
    global stop_generation
    stop_generation = False
    if raw_hist is None:
        raw_hist = []
    # chat is a generator, so even the empty-message case must yield rather
    # than return a value.
    if not msg.strip():
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return
    raw_hist.append({"role": "user", "content": msg.strip()})
    place = {"role": "assistant", "content": ""}
    raw_hist.append(place)
    yield raw_hist, copy.deepcopy(raw_hist), ""
    try:
        # Pass the history without the empty placeholder as model context.
        for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
            if stop_generation:
                break
            place["content"] = chunk
            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        place["content"] = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
        yield raw_hist, copy.deepcopy(raw_hist), ""
    yield raw_hist, copy.deepcopy(raw_hist), ""
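

# "Clear" handler: raises the stop flag so running generators exit, sleeps
# briefly to let them notice, then empties the chat, the state, and the textbox.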
def reset():
    global stop_generation
    stop_generation = True
    time.sleep(0.1)
    return [], [], ""
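

# UI: chat panel on the left; thinking toggle, temperature, and system prompt
# on the right. sanitize_html=False on the Chatbot lets the inline-styled HTML
# fragments built above render as markup instead of being escaped.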
demo = gr.Blocks(title="GLM-4.5 API Space", theme=gr.themes.Soft())
with demo:
    gr.HTML(
        "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.5 API Space</div>"
        "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>"
        "This Space uses the API version of the service for faster responses.<br>"
        "Chat only. For tool use, MCP support, and web search, please refer to the API.</div>"
        "<div style='text-align:center;'><a href='https://huggingface.co/THUDM/GLM-4.5'>Model Hub</a> | "
        "<a href='https://github.com/THUDM/GLM-4.5'>GitHub</a> | "
        "<a href='https://www.bigmodel.cn'>API</a></div>"
    )
    raw_history = gr.State([])
    with gr.Row():
        with gr.Column(scale=7):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True,
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
            gr.HTML(
                "<div style='color:red;font-size:12px;margin-top:5px;margin-bottom:15px;'>"
                "ON: the model reasons before answering.<br>"
                "OFF: thinking is disabled and the model answers directly, without reasoning."
                "</div>"
            )
            temperature_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
                label="Temperature",
            )
            sys = gr.Textbox(label="System Prompt", lines=6)

    send.click(
        chat,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox],
    )
    textbox.submit(
        chat,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox],
    )
    clear.click(
        reset,
        outputs=[chatbox, raw_history, textbox],
    )

if __name__ == "__main__":
    demo.launch()