import copy
import time
import html

from openai import OpenAI
import gradio as gr

# Cooperative cancellation flag shared by the UI callbacks and the streaming loop.
stop_generation = False


def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
    """Stream chat-completion deltas from an OpenAI-compatible endpoint serving GLM-4.5."""
    global stop_generation
    # OpenAI() picks up OPENAI_API_KEY / OPENAI_BASE_URL from the environment;
    # point them at the endpoint that serves GLM-4.5.
    client = OpenAI()
    response = client.chat.completions.create(
        model="GLM-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=65536,
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            }
        }
    )
    for chunk in response:
        if stop_generation:
            break
        if chunk.choices and chunk.choices[0].delta:
            yield chunk.choices[0].delta
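
# A minimal sketch of pointing the client at a self-hosted vLLM server instead of
# relying on environment variables. The localhost URL and the "EMPTY" api_key are
# assumptions, not part of the original app ("EMPTY" is a common placeholder when
# the server does not enforce authentication):
#
#   client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
#
# The extra_body "thinking" field is forwarded verbatim in the request body; backends
# that do not recognize it may ignore it or reject the request.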


class GLM45Model:
    def __init__(self):
        # Accumulators for the current streamed reply; reset before every generation.
        self.accumulated_content = ""
        self.accumulated_reasoning = ""

    def reset_state(self):
        self.accumulated_content = ""
        self.accumulated_reasoning = ""

    def _render_response(self, reasoning_content, regular_content, skip_think=False):
        # Render the reasoning trace in a collapsible <details> block, followed by the answer.
        html_parts = []
        if reasoning_content and not skip_think:
            reasoning_escaped = html.escape(reasoning_content).replace("\n", "<br>")
            think_html = (
                "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>Thinking</summary>"
                "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
                + reasoning_escaped +
                "</div></details>"
            )
            html_parts.append(think_html)
        if regular_content:
            content_escaped = html.escape(regular_content).replace("\n", "<br>")
            content_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_escaped}</div>"
            html_parts.append(content_html)
        return "".join(html_parts)

    def _build_messages(self, raw_hist, sys_prompt):
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": sys_prompt.strip()})
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": h["content"]})
            else:
                msg = {"role": "assistant", "content": h.get("content", "")}
                if h.get("reasoning_content"):
                    msg["reasoning_content"] = h.get("reasoning_content")
                msgs.append(msg)
        return msgs
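
    # For illustration only (not part of the original app): given a system prompt of
    # "Be brief." and one prior exchange, _build_messages produces something like
    #
    #   [{"role": "system", "content": "Be brief."},
    #    {"role": "user", "content": "Hi"},
    #    {"role": "assistant", "content": "Hello!", "reasoning_content": "..."}]
    #
    # i.e. any stored reasoning_content is sent back alongside the assistant's visible
    # reply, mirroring how it is stored in chat() below.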

    def stream_generate(self, raw_hist, sys_prompt, thinking_enabled=True, temperature=1.0):
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        self.reset_state()
        try:
            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
                if stop_generation:
                    break
                # Deltas may carry regular content, reasoning content, neither, or both.
                if hasattr(delta, 'content') and delta.content:
                    self.accumulated_content += delta.content
                if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
                    self.accumulated_reasoning += delta.reasoning_content
                yield self._render_response(
                    self.accumulated_reasoning, self.accumulated_content, skip_think=not thinking_enabled
                )
        except Exception as e:
            yield self._render_response("", f"Error: {str(e)}")


glm45 = GLM45Model()


def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
    global stop_generation
    stop_generation = False
    if raw_hist is None:
        raw_hist = []
    if not msg.strip():
        # Nothing to send: yield the current state unchanged (a bare `return value`
        # inside a generator would discard the value).
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return
    raw_hist.append({"role": "user", "content": msg.strip()})
    # Placeholder assistant message that is filled in as the stream arrives.
    place = {
        "role": "assistant",
        "content": "",
        "reasoning_content": ""
    }
    raw_hist.append(place)
    yield raw_hist, copy.deepcopy(raw_hist), ""
    try:
        # Pass the history without the placeholder so the model does not see an empty turn.
        for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
            if stop_generation:
                break
            place["content"] = glm45.accumulated_content
            place["reasoning_content"] = glm45.accumulated_reasoning
            place["display_content"] = chunk
            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        place["content"] = f"Error: {str(e)}"
        place["display_content"] = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
        yield raw_hist, copy.deepcopy(raw_hist), ""


def reset():
    global stop_generation
    # Signal any in-flight stream to stop, give it a moment to notice, then clear the UI.
    stop_generation = True
    time.sleep(0.1)
    return [], [], ""


def format_history_for_display(raw_hist):
    display_hist = []
    for msg in raw_hist:
        if msg["role"] == "user":
            display_hist.append({"role": "user", "content": msg["content"]})
        else:
            content = msg.get("display_content", msg.get("content", ""))
            display_hist.append({"role": "assistant", "content": content})
    return display_hist
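
# For illustration only (not part of the original app): a raw assistant entry such as
#
#   {"role": "assistant", "content": "4", "reasoning_content": "2+2...",
#    "display_content": "<details ...>...</details><div ...>4</div>"}
#
# is shown in the Chatbot via its display_content HTML, while the plain content and
# reasoning_content fields are what chat() keeps and sends back to the API.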


demo = gr.Blocks(title="GLM-4.5 API Demo", theme=gr.themes.Soft())

with demo:
    gr.HTML(
        "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.5 API Demo</div>"
        "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>"
        "This demo uses the API version of the service for faster responses.<br>"
        "Only chat with a 64K token context is supported. For tool use, MCP support, and web search, please refer to the API documentation.</div>"
        "<div style='text-align:center;'><a href='https://modelscope.cn/collections/GLM-45-b8693e2a08984f'>Model</a> | "
        "<a href='https://github.com/zai-org/GLM-4.5'>Github</a> | "
        "<a href='http://z.ai/blog/glm-4.5'>Blog</a> | "
        "<a href='https://docs.bigmodel.cn/cn/guide/models/text/glm-4.5'>API Docs</a></div>"
    )
    raw_history = gr.State([])
    with gr.Row():
        with gr.Column(scale=7):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
            gr.HTML(
                "<div style='color:red;font-size:12px;margin-top:5px;margin-bottom:15px;'>"
                "Enabled: Activates the model's thinking capability. The model will decide whether to think based on the situation and may return empty thinking content.<br>"
                "Disabled: Disables the model's thinking capability. The model will answer questions directly without reasoning."
                "</div>"
            )
            temperature_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
                label="Temperature"
            )
            sys = gr.Textbox(label="System Prompt", lines=6)

    def chat_wrapper(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
        # Convert the raw history (which carries reasoning and rendered HTML) into
        # plain messages for gr.Chatbot on every streamed update.
        for hist, raw_hist_updated, textbox_value in chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
            display_hist = format_history_for_display(hist)
            yield display_hist, raw_hist_updated, textbox_value

    send.click(
        chat_wrapper,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    textbox.submit(
        chat_wrapper,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox]
    )
    clear.click(
        reset,
        outputs=[chatbox, raw_history, textbox]
    )
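
    # A possible extension, not part of the original app: the global stop_generation
    # flag is currently only set by "Clear", so a dedicated Stop button could reuse it.
    # The stop_btn name below is hypothetical; a sketch:
    #
    #   def stop():
    #       global stop_generation
    #       stop_generation = True
    #
    #   stop_btn = gr.Button("Stop")
    #   stop_btn.click(stop)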


if __name__ == "__main__":
    # No queue size limit and no per-event concurrency cap.
    demo.queue(max_size=None, default_concurrency_limit=None)
    demo.launch()