Molbap HF Staff committed on
Commit 0ce686a · 1 Parent(s): 2f8bacd
Files changed (1)
  1. app.py +195 -0
app.py ADDED
@@ -0,0 +1,195 @@
+ import os, json, subprocess, sys, textwrap, tempfile, shlex, pandas as pd
+
+ import gradio as gr
+ import spaces
+
+ # --- Attention mask visualizer (Transformers) ---
+ # Docs show: from transformers.utils.attention_visualizer import AttentionMaskVisualizer
+ # Ref: https://huggingface.co/docs/transformers/... pages mention this util.
+ def _import_visualizer():
+     from transformers.utils.attention_visualizer import AttentionMaskVisualizer  # type: ignore[attr-defined]
+     return AttentionMaskVisualizer
+
+ @spaces.GPU(duration=120)
+ def run_attention_visualizer(model_id: str, prompt: str) -> str:
+     """
+     Returns HTML produced by AttentionMaskVisualizer(model_id)(prompt).
+     We render it into an HTML component.
+     """
+     AttentionMaskVisualizer = _import_visualizer()
+     vis = AttentionMaskVisualizer(model_id)
+     html_or_obj = vis(prompt)  # recent Transformers returns embeddable HTML
+     return str(html_or_obj)
+
+ # --- Minimal “terminal” (sandboxed) ---
+ def run_shell(cmd: str) -> str:
+     # Simple, constrained shell: block backgrounding, pipes, redirects; allow common tooling.
+     blocked = any(tok in cmd for tok in ["|", ">", "<", "&", "`"])
+     if blocked:
+         return "Blocked characters detected. Use a single command without pipes/redirections."
+     try:
+         out = subprocess.run(
+             cmd, shell=True, check=False, capture_output=True, text=True, timeout=30
+         )
+         return f"$ {cmd}\n{out.stdout}{out.stderr}"
+     except Exception as e:
+         return f"$ {cmd}\n{e!r}"
+
+ # --- KV-cache / CUDA caching allocator profiling ---
+ # We launch a short Python program twice (allocator on/off) in a subprocess so the env var takes effect pre-import.
+ PROFILE_SNIPPET = r"""
+ import os, json, time, torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ model_id = os.environ.get("HF_MODEL_ID", "openai-community/gpt2")
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ tok = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16 if device=="cuda" else None).to(device)
+
+ prompt = os.environ.get("HF_PROMPT", "Transformers are great for sequence modeling.")
+ steps = int(os.environ.get("HF_STEPS", "32"))
+
+ inputs = tok(prompt, return_tensors="pt").to(device)
+ if device == "cuda":
+     torch.cuda.reset_peak_memory_stats()
+     torch.cuda.synchronize()
+
+ def mem():
+     if device != "cuda":
+         return {"allocated": 0, "reserved": 0}
+     return {
+         "allocated": int(torch.cuda.memory_allocated()),
+         "reserved": int(torch.cuda.memory_reserved()),
+     }
+
+ print(json.dumps({"t": 0, **mem()}), flush=True)
+
+ # Step-by-step generation to grow the KV cache
+ past = None
+ input_ids = inputs.input_ids
+ for i in range(1, steps + 1):
+     with torch.inference_mode():
+         out = model(input_ids=input_ids, use_cache=True, past_key_values=past)
+     past = out.past_key_values
+     # feed a single token next (eos or pad as a stand-in; enough to grow the cache)
+     next_id = torch.tensor([[tok.eos_token_id or tok.pad_token_id or 0]], device=device)
+     input_ids = next_id
+     if device == "cuda":
+         torch.cuda.synchronize()
+     print(json.dumps({"t": i, **mem()}), flush=True)
+ """
+
+ def _run_profile_once(model_id: str, prompt: str, steps: int, disable_cache: bool) -> list[dict]:
+     env = os.environ.copy()
+     env["HF_MODEL_ID"] = model_id
+     env["HF_PROMPT"] = prompt
+     env["HF_STEPS"] = str(steps)
+     # IMPORTANT: set before torch import in the child
+     if disable_cache:
+         env["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"
+     with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
+         f.write(PROFILE_SNIPPET)
+         path = f.name
+     try:
+         p = subprocess.Popen(
+             [sys.executable, path],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             text=True,
+             env=env,
+         )
+         out_lines = []
+         assert p.stdout is not None
+         for line in p.stdout:
+             line = line.strip()
+             if not line:
+                 continue
+             try:
+                 out_lines.append(json.loads(line))
+             except json.JSONDecodeError:
+                 # ignore stray prints from HF / torch
+                 pass
+         p.wait(timeout=300)
+         return out_lines
+     finally:
+         try:
+             os.remove(path)
+         except OSError:
+             pass
+
+ @spaces.GPU(duration=180)
+ def profile_allocator(model_id: str, prompt: str, steps: int):
+     """Return a DataFrame ready for gr.LinePlot: t, MiB, kind, mode."""
+     on = _run_profile_once(model_id, prompt, steps, disable_cache=False)
+     off = _run_profile_once(model_id, prompt, steps, disable_cache=True)
+
+     def rows(series, mode):
+         for rec in series:
+             t = rec.get("t", 0)
+             allocated = rec.get("allocated", 0) / (1024**2)
+             reserved = rec.get("reserved", 0) / (1024**2)
+             yield {"t": t, "MiB": allocated, "kind": "allocated", "mode": mode}
+             yield {"t": t, "MiB": reserved, "kind": "reserved", "mode": mode}
+
+     df = pd.DataFrame(list(rows(on, "caching ON")) + list(rows(off, "caching OFF")))
+     return df
+
+ # --- UI ---
+ with gr.Blocks(fill_height=True) as demo:
+     gr.Markdown(
+         textwrap.dedent("""
+         ### Transformers feature showcase (ZeroGPU-ready)
+         - Attention mask visualizer
+         - Minimal terminal
+         - KV cache vs. CUDA caching allocator memory plot
+         """).strip()
+     )
+
+     with gr.Tabs():
+         with gr.Tab("Attention mask visualizer"):
+             with gr.Row():
+                 model_dd = gr.Dropdown(
+                     label="Model",
+                     choices=[
+                         "openai-community/gpt2",
+                         "google/gemma-2-2b",  # heavier; try if bandwidth allows
+                     ],
+                     value="openai-community/gpt2",
+                     allow_custom_value=True,
+                 )
+                 prompt_tb = gr.Textbox(label="Prompt", value="You are an assistant. Make sure you print me.")
+             run_btn = gr.Button("Render")
+             html_out = gr.HTML()
+             run_btn.click(run_attention_visualizer, inputs=[model_dd, prompt_tb], outputs=html_out)
+
+         with gr.Tab("Terminal (simplified)"):
+             cmd = gr.Textbox(label="Command", value="python -c 'import torch; print(torch.__version__)'")
+             run_b = gr.Button("Run")
+             out = gr.Textbox(label="Output", lines=18, interactive=False)
+             run_b.click(run_shell, inputs=cmd, outputs=out)
+
+         with gr.Tab("Cache allocator plot"):
+             with gr.Row():
+                 model_mem = gr.Dropdown(
+                     label="Model",
+                     choices=["openai-community/gpt2"],
+                     value="openai-community/gpt2",
+                     allow_custom_value=True,
+                 )
+                 prompt_mem = gr.Textbox(label="Prompt", value="A short test prompt.")
+                 steps = gr.Slider(8, 128, value=32, step=1, label="Steps (tokens)")
+             go = gr.Button("Profile")
+             df_out = gr.Dataframe(visible=False)  # optional debugging
+             plot = gr.LinePlot(
+                 x="t", y="MiB", color="mode", overlay_point=True,
+                 title="GPU memory over steps (allocated vs reserved; caching ON vs OFF)",
+                 group="kind", tooltip=["t", "MiB", "kind", "mode"], width=900, height=450
+             )
+             go.click(profile_allocator, inputs=[model_mem, prompt_mem, steps], outputs=plot)
+
+     # Placeholder for a future FastRTC tab; the Space structure supports it.
+     # See: https://www.gradio.app/guides/create-immersive-demo (WebRTC + Stream with FastRTC)
+
+ if __name__ == "__main__":
+     demo.launch()
+
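Usage note (not part of this commit): the allocator comparison can also be exercised without the UI by importing profile_allocator from app.py and inspecting the returned DataFrame. A minimal sketch, assuming the Space's dependencies (torch, transformers, gradio, spaces, pandas) are installed locally and a CUDA GPU is visible; the file name sanity_check.py is hypothetical:

# sanity_check.py -- illustrative only; not included in this commit.
# Runs the same two child processes (caching ON vs OFF) that the
# "Cache allocator plot" tab uses, then prints peak memory per mode/kind.
from app import profile_allocator

df = profile_allocator(
    model_id="openai-community/gpt2",
    prompt="A short test prompt.",
    steps=16,
)
# Reserved memory should differ most between caching ON and OFF, since the
# caching allocator keeps freed blocks reserved instead of returning them.
print(df.groupby(["mode", "kind"])["MiB"].max())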