Molbap HF Staff committed on
Commit 0ce686a · 1 Parent(s): 2f8bacd
Files changed (1)
  1. app.py +195 -0
app.py ADDED
@@ -0,0 +1,195 @@
+ import os, json, subprocess, sys, textwrap, tempfile, shlex, pandas as pd
+
+ import gradio as gr
+ import spaces
+
+ # --- Attention mask visualizer (Transformers) ---
+ # Docs show: from transformers.utils.attention_visualizer import AttentionMaskVisualizer
+ # Ref: https://huggingface.co/docs/transformers/... pages mention this util.
+ def _import_visualizer():
+     from transformers.utils.attention_visualizer import AttentionMaskVisualizer  # type: ignore[attr-defined]
+     return AttentionMaskVisualizer
+
+ @spaces.GPU(duration=120)
+ def run_attention_visualizer(model_id: str, prompt: str) -> str:
+     """
+     Returns HTML produced by AttentionMaskVisualizer(model_id)(prompt).
+     We render it into an HTML component.
+     """
+     AttentionMaskVisualizer = _import_visualizer()
+     vis = AttentionMaskVisualizer(model_id)
+     html_or_obj = vis(prompt)  # recent Transformers returns embeddable HTML
+     return str(html_or_obj)
+
+ # --- Minimal “terminal” (sandboxed) ---
+ def run_shell(cmd: str) -> str:
+     # Simple, constrained shell: block backgrounding, pipes, redirects; allow common tooling.
+     blocked = any(tok in cmd for tok in ["|", ">", "<", "&", "`"])
+     if blocked:
+         return "Blocked characters detected. Use a single command without pipes/redirections."
+     try:
+         out = subprocess.run(
+             cmd, shell=True, check=False, capture_output=True, text=True, timeout=30
+         )
+         return f"$ {cmd}\n{out.stdout}{out.stderr}"
+     except Exception as e:
+         return f"$ {cmd}\n{e!r}"
+
+ # --- KV-cache / CUDA caching allocator profiling ---
+ # We launch a short Python program twice (allocator on/off) in a subprocess so the env var takes effect pre-import.
+ PROFILE_SNIPPET = r"""
+ import os, json, time, torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ model_id = os.environ.get("HF_MODEL_ID", "openai-community/gpt2")
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ tok = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16 if device=="cuda" else None).to(device)
+
+ prompt = os.environ.get("HF_PROMPT", "Transformers are great for sequence modeling.")
+ steps = int(os.environ.get("HF_STEPS", "32"))
+
+ inputs = tok(prompt, return_tensors="pt").to(device)
+ if device == "cuda":
+     torch.cuda.reset_peak_memory_stats()
+     torch.cuda.synchronize()
+
+ def mem():
+     if device != "cuda":
+         return {"allocated": 0, "reserved": 0}
+     return {
+         "allocated": int(torch.cuda.memory_allocated()),
+         "reserved": int(torch.cuda.memory_reserved()),
+     }
+
+ print(json.dumps({"t": 0, **mem()}), flush=True)
+
+ # Step-by-step generation to grow the KV cache
+ past = None
+ input_ids = inputs.input_ids
+ for i in range(1, steps + 1):
+     with torch.inference_mode():
+         out = model(input_ids=input_ids, use_cache=True, past_key_values=past)
+     past = out.past_key_values
+     # feed a single token next (eos or pad as a stand-in; enough to grow the cache)
+     next_id = torch.tensor([[tok.eos_token_id or tok.pad_token_id or 0]], device=device)
+     input_ids = next_id
+     if device == "cuda":
+         torch.cuda.synchronize()
+     print(json.dumps({"t": i, **mem()}), flush=True)
+ """
+
+ def _run_profile_once(model_id: str, prompt: str, steps: int, disable_cache: bool) -> list[dict]:
+     env = os.environ.copy()
+     env["HF_MODEL_ID"] = model_id
+     env["HF_PROMPT"] = prompt
+     env["HF_STEPS"] = str(steps)
+     # IMPORTANT: set before torch import in the child
+     if disable_cache:
+         env["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"
+     with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
+         f.write(PROFILE_SNIPPET)
+         path = f.name
+     try:
+         p = subprocess.Popen(
+             [sys.executable, path],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             text=True,
+             env=env,
+         )
+         out_lines = []
+         assert p.stdout is not None
+         for line in p.stdout:
+             line = line.strip()
+             if not line:
+                 continue
+             try:
+                 out_lines.append(json.loads(line))
+             except json.JSONDecodeError:
+                 # ignore stray prints from HF / torch
+                 pass
+         p.wait(timeout=300)
+         return out_lines
+     finally:
+         try:
+             os.remove(path)
+         except OSError:
+             pass
+
+ @spaces.GPU(duration=180)
+ def profile_allocator(model_id: str, prompt: str, steps: int):
+     """Return a DataFrame ready for gr.LinePlot: t, MiB, kind, mode."""
+     on = _run_profile_once(model_id, prompt, steps, disable_cache=False)
+     off = _run_profile_once(model_id, prompt, steps, disable_cache=True)
+
+     def rows(series, mode):
+         for rec in series:
+             t = rec.get("t", 0)
+             allocated = rec.get("allocated", 0) / (1024**2)
+             reserved = rec.get("reserved", 0) / (1024**2)
+             yield {"t": t, "MiB": allocated, "kind": "allocated", "mode": mode}
+             yield {"t": t, "MiB": reserved, "kind": "reserved", "mode": mode}
+
+     df = pd.DataFrame(list(rows(on, "caching ON")) + list(rows(off, "caching OFF")))
+     return df
+
+ # --- UI ---
+ with gr.Blocks(fill_height=True) as demo:
+     gr.Markdown(
+         textwrap.dedent("""
+         ### Transformers feature showcase (ZeroGPU-ready)
+         - Attention mask visualizer
+         - Minimal terminal
+         - KV cache vs. CUDA caching allocator memory plot
+         """).strip()
+     )
+
+     with gr.Tabs():
+         with gr.Tab("Attention mask visualizer"):
+             with gr.Row():
+                 model_dd = gr.Dropdown(
+                     label="Model",
+                     choices=[
+                         "openai-community/gpt2",
+                         "google/gemma-2-2b",  # heavier; try if bandwidth allows
+                     ],
+                     value="openai-community/gpt2",
+                     allow_custom_value=True,
+                 )
+                 prompt_tb = gr.Textbox(label="Prompt", value="You are an assistant. Make sure you print me.")
+             run_btn = gr.Button("Render")
+             html_out = gr.HTML()
+             run_btn.click(run_attention_visualizer, inputs=[model_dd, prompt_tb], outputs=html_out)
+
+         with gr.Tab("Terminal (simplified)"):
+             cmd = gr.Textbox(label="Command", value="python -c 'import torch; print(torch.__version__)'")
+             run_b = gr.Button("Run")
+             out = gr.Textbox(label="Output", lines=18, interactive=False)
+             run_b.click(run_shell, inputs=cmd, outputs=out)
+
+         with gr.Tab("Cache allocator plot"):
+             with gr.Row():
+                 model_mem = gr.Dropdown(
+                     label="Model",
+                     choices=["openai-community/gpt2"],
+                     value="openai-community/gpt2",
+                     allow_custom_value=True,
+                 )
+                 prompt_mem = gr.Textbox(label="Prompt", value="A short test prompt.")
+                 steps = gr.Slider(8, 128, value=32, step=1, label="Steps (tokens)")
+             go = gr.Button("Profile")
+             df_out = gr.Dataframe(visible=False)  # optional debugging
+             plot = gr.LinePlot(
+                 x="t", y="MiB", color="mode", overlay_point=True,
+                 title="GPU memory over steps (allocated vs reserved; caching ON vs OFF)",
+                 group="kind", tooltip=["t", "MiB", "kind", "mode"], width=900, height=450
+             )
+             go.click(profile_allocator, inputs=[model_mem, prompt_mem, steps], outputs=plot)
+
+     # Placeholder for a future FastRTC tab; the Space structure supports it.
+     # See: https://www.gradio.app/guides/create-immersive-demo (WebRTC + Stream with FastRTC)
+
+ if __name__ == "__main__":
+     demo.launch()
+
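Usage note (not part of this commit): the allocator comparison can also be exercised without the UI by importing profile_allocator from app.py and inspecting the returned DataFrame. A minimal sketch, assuming the Space's dependencies (torch, transformers, gradio, spaces, pandas) are installed locally and a CUDA GPU is visible; the file name sanity_check.py is hypothetical:

# sanity_check.py -- illustrative only; not included in this commit.
# Runs the same two child processes (caching ON vs OFF) that the
# "Cache allocator plot" tab uses, then prints peak memory per mode/kind.
from app import profile_allocator

df = profile_allocator(
    model_id="openai-community/gpt2",
    prompt="A short test prompt.",
    steps=16,
)
# Reserved memory should differ most between caching ON and OFF, since the
# caching allocator keeps freed blocks reserved instead of returning them.
print(df.groupby(["mode", "kind"])["MiB"].max())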