base app
app.py
ADDED
@@ -0,0 +1,195 @@
import os, json, subprocess, sys, textwrap, tempfile, shlex, pandas as pd

import gradio as gr
import spaces
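# Assumed (not pinned here): the Space's requirements.txt provides gradio, spaces (ZeroGPU),
# transformers, torch and pandas, matching the imports above.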

# --- Attention mask visualizer (Transformers) ---
# Docs show: from transformers.utils.attention_visualizer import AttentionMaskVisualizer
# Ref: https://huggingface.co/docs/transformers/... pages mention this util.
def _import_visualizer():
    from transformers.utils.attention_visualizer import AttentionMaskVisualizer  # type: ignore[attr-defined]
    return AttentionMaskVisualizer

@spaces.GPU(duration=120)
def run_attention_visualizer(model_id: str, prompt: str) -> str:
    """
    Returns HTML produced by AttentionMaskVisualizer(model_id)(prompt).
    We render it into an HTML component.
    """
    AttentionMaskVisualizer = _import_visualizer()
    vis = AttentionMaskVisualizer(model_id)
    html_or_obj = vis(prompt)  # recent Transformers returns embeddable HTML
    return str(html_or_obj)

# --- Minimal “terminal” (sandboxed) ---
def run_shell(cmd: str) -> str:
    # Simple, constrained shell: block backgrounding, pipes, redirects; allow common tooling.
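    # Note: shell=True still executes whatever single command is given, so this character
    # blocklist is a convenience filter, not a real security boundary for untrusted input.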
    blocked = any(tok in cmd for tok in ["|", ">", "<", "&", ";", "`"])
    if blocked:
        return "Blocked characters detected. Use a single command without pipes, redirections, or chaining."
    try:
        out = subprocess.run(
            cmd, shell=True, check=False, capture_output=True, text=True, timeout=30
        )
        return f"$ {cmd}\n{out.stdout}{out.stderr}"
    except Exception as e:
        return f"$ {cmd}\n{e!r}"

# --- KV-cache / CUDA caching allocator profiling ---
# We launch a short Python program twice (allocator on/off) in a subprocess so the env var takes effect pre-import.
PROFILE_SNIPPET = r"""
import os, json, time, torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = os.environ.get("HF_MODEL_ID", "openai-community/gpt2")
device = "cuda" if torch.cuda.is_available() else "cpu"
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16 if device=="cuda" else None).to(device)

prompt = os.environ.get("HF_PROMPT", "Transformers are great for sequence modeling.")
steps = int(os.environ.get("HF_STEPS", "32"))

inputs = tok(prompt, return_tensors="pt").to(device)
if device == "cuda":
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()

def mem():
    if device != "cuda":
        return {"allocated": 0, "reserved": 0}
    return {
        "allocated": int(torch.cuda.memory_allocated()),
        "reserved": int(torch.cuda.memory_reserved()),
    }

print(json.dumps({"t": 0, **mem()}), flush=True)

# Step-by-step generation to grow KV cache
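# Each forward pass with use_cache=True appends this step's keys/values to past_key_values,
# so KV-cache memory should grow roughly linearly with the number of steps.
# memory_allocated() counts tensors currently in use; memory_reserved() also counts blocks
# the caching allocator keeps around for reuse, which is the gap the plot is meant to show.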
past = None
input_ids = inputs.input_ids
for i in range(1, steps+1):
    with torch.inference_mode():
        out = model(input_ids=input_ids, use_cache=True, past_key_values=past)
    past = out.past_key_values
    # feed a single token next (use eos or last predicted token if available)
    next_id = torch.tensor([[tok.eos_token_id or tok.pad_token_id or 0]], device=device)
    input_ids = next_id
    if device == "cuda":
        torch.cuda.synchronize()
    print(json.dumps({"t": i, **mem()}), flush=True)
"""

def _run_profile_once(model_id: str, prompt: str, steps: int, disable_cache: bool) -> list[dict]:
    env = os.environ.copy()
    env["HF_MODEL_ID"] = model_id
    env["HF_PROMPT"] = prompt
    env["HF_STEPS"] = str(steps)
    # IMPORTANT: set before torch import in the child
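    # PYTORCH_NO_CUDA_MEMORY_CACHING=1 disables PyTorch's caching allocator, so the child
    # process goes to cudaMalloc/cudaFree for each allocation: slower, but reserved memory
    # should then track allocated memory closely instead of plateauing above it.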
    if disable_cache:
        env["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
        f.write(PROFILE_SNIPPET)
        path = f.name
    try:
        p = subprocess.Popen(
            [sys.executable, path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env,
        )
        out_lines = []
        assert p.stdout is not None
        for line in p.stdout:
            line = line.strip()
            if not line:
                continue
            try:
                out_lines.append(json.loads(line))
            except json.JSONDecodeError:
                # ignore stray prints from HF / torch
                pass
        p.wait(timeout=300)
        return out_lines
    finally:
        try:
            os.remove(path)
        except OSError:
            pass

@spaces.GPU(duration=180)
def profile_allocator(model_id: str, prompt: str, steps: int):
    """Return a DataFrame ready for gr.LinePlot: t, MiB, kind, mode."""
    on = _run_profile_once(model_id, prompt, steps, disable_cache=False)
    off = _run_profile_once(model_id, prompt, steps, disable_cache=True)

    def rows(series, mode):
        for rec in series:
            t = rec.get("t", 0)
            allocated = rec.get("allocated", 0) / (1024**2)
            reserved = rec.get("reserved", 0) / (1024**2)
            yield {"t": t, "MiB": allocated, "kind": "allocated", "mode": mode}
            yield {"t": t, "MiB": reserved, "kind": "reserved", "mode": mode}

    df = pd.DataFrame(list(rows(on, "caching ON")) + list(rows(off, "caching OFF")))
    return df

# --- UI ---
with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(
        textwrap.dedent("""
        ### Transformers feature showcase (ZeroGPU-ready)
        - Attention mask visualizer
        - Minimal terminal
        - KV cache vs. CUDA caching allocator memory plot
        """).strip()
    )

    with gr.Tabs():
        with gr.Tab("Attention mask visualizer"):
            with gr.Row():
                model_dd = gr.Dropdown(
                    label="Model",
                    choices=[
                        "openai-community/gpt2",
                        "google/gemma-2-2b",  # heavier; try if bandwidth allows
                    ],
                    value="openai-community/gpt2",
                    allow_custom_value=True,
                )
                prompt_tb = gr.Textbox(label="Prompt", value="You are an assistant. Make sure you print me.")
            run_btn = gr.Button("Render")
            html_out = gr.HTML()
            run_btn.click(run_attention_visualizer, inputs=[model_dd, prompt_tb], outputs=html_out)

        with gr.Tab("Terminal (simplified)"):
            cmd = gr.Textbox(label="Command", value="python -c 'import torch; print(torch.__version__)'")
            run_b = gr.Button("Run")
            out = gr.Textbox(label="Output", lines=18, interactive=False)
            run_b.click(run_shell, inputs=cmd, outputs=out)

        with gr.Tab("Cache allocator plot"):
            with gr.Row():
                model_mem = gr.Dropdown(
                    label="Model",
                    choices=["openai-community/gpt2"],
                    value="openai-community/gpt2",
                    allow_custom_value=True,
                )
                prompt_mem = gr.Textbox(label="Prompt", value="A short test prompt.")
                steps = gr.Slider(8, 128, value=32, step=1, label="Steps (tokens)")
            go = gr.Button("Profile")
            df_out = gr.Dataframe(visible=False)  # optional debugging
            plot = gr.LinePlot(
                x="t", y="MiB", color="mode", overlay_point=True,
                title="GPU memory over steps (allocated vs reserved; caching ON vs OFF)",
                group="kind", tooltip=["t", "MiB", "kind", "mode"], width=900, height=450
            )
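            # Expected shape of the plot (assumption to verify): with caching ON, "reserved"
            # steps up in chunks and stays above "allocated"; with caching OFF, the two curves
            # should track each other as the KV cache grows.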
            go.click(profile_allocator, inputs=[model_mem, prompt_mem, steps], outputs=plot)

    # Placeholder for a future FastRTC tab; the Space structure supports it.
    # See: https://www.gradio.app/guides/create-immersive-demo (WebRTC + Stream with FastRTC)

if __name__ == "__main__":
    demo.launch()