<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>TokenVisualizer — Minimal</title>
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600&family=JetBrains+Mono:wght@400;600&display=swap" rel="stylesheet">
  <style>
    :root{
      --bg:#0b0f14; --text:#ffffff; --muted:#9aa4b2; --accent:#38bdf8; --border:#1f2a3a;
      --card1:#0c1624; --card2:#0a1220; --chip:#111827; --chip-border:#263246; --chip-hover:#1a2434;
      --mono:'JetBrains Mono',ui-monospace,Menlo,Consolas,monospace; --sans:Inter,system-ui,-apple-system,"Segoe UI",Roboto,Ubuntu,"Helvetica Neue",Arial;
    }
    *{box-sizing:border-box} body{margin:0;background:radial-gradient(900px 500px at 10% -10%, #07314a, transparent),var(--bg);color:var(--text);font-family:var(--sans)}
    .container{max-width:1100px;margin:0 auto;padding:1.25rem}
    header{padding-top:1.5rem} h1{margin:.2rem 0 .4rem;font-size:1.9rem}
    .sub{color:var(--muted);margin:.25rem 0 1rem}
    .card{background:linear-gradient(180deg,var(--card1),var(--card2));border:1px solid var(--border);border-radius:14px;padding:1rem;box-shadow:0 10px 40px rgba(0,0,0,.35)}
    label span{color:var(--muted);font-size:.9rem}
    select,textarea{width:100%;border-radius:10px;border:1px solid var(--border);background:#0a1220;color:var(--text);padding:.7rem .85rem;outline:none}
    select:focus,textarea:focus{border-color:var(--accent)}
    .controls{display:grid;gap:.8rem;margin-bottom:1rem}
    .row{display:flex;gap:.75rem;align-items:center}
    .status{color:var(--muted)}
    .grid{display:grid;gap:1rem;grid-template-columns:1fr}
    @media (min-width:900px){.grid{grid-template-columns:1fr 1fr}}
    .head{display:flex;align-items:center;justify-content:space-between;margin-bottom:0}
    .tokens{display:flex;flex-wrap:wrap;gap:.5rem;max-height:360px;overflow:auto;padding:.25rem}
    .chip{border:1px solid var(--chip-border);background:var(--chip);padding:.35rem .5rem;border-radius:10px;font-family:var(--mono);font-size:.9rem;transition:background .12s,border-color .12s}
    .chip:hover{background:var(--chip-hover);border-color:var(--accent)}
    .chip.active{outline:2px solid var(--accent)}
    .chip.special{border-color:#38bdf8;background:#0b2235}
    pre.ids{font-family:var(--mono);background:#0a1220;border:1px solid var(--border);border-radius:10px;padding:.75rem;max-height:360px;overflow:auto;white-space:pre-wrap}
    .caption{color:var(--muted);font-size:.9rem;margin-top:0;margin-bottom:.75rem}
    footer{color:var(--muted);text-align:center;padding:1.25rem 0 2rem}
    a{color:var(--accent)}
  </style>
</head>
<body>
  <header class="container">
    <h1>Token Visualizer</h1>
    <p class="sub">Enter any text and see how a language model splits it into tokens and numeric IDs, the units it actually computes on.</p>
  </header>
  <main class="container">
    <section class="card controls">
      <label>
        <span>Model</span>
        <select id="model">
          <!-- Tip: keep this first so the demo works instantly once you upload /assets/gpt2/* -->
          <option value="local:gpt2">GPT-2 (local, fast)</option>
          <option value="Xenova/llama2-tokenizer">Llama-2 (Hub)</option>
          <option value="Xenova/mistral-tokenizer">Mistral (Hub)</option>
          <option value="Xenova/gemma-tokenizer">Gemma (Hub)</option>
          <option value="Xenova/bert-base-uncased">BERT Base Uncased (Hub)</option>
        </select>
      </label>
      <label>
        <span>Text</span>
        <textarea id="input" rows="3">Curiosity propelled the cat to unfathomable heights.</textarea>
      </label>
      <div class="row">
        <span id="status" class="status">Loading tokenizer…</span>
      </div>
    </section>
    <section class="grid">
      <article class="card">
        <div class="head"><h3>Tokens</h3></div>
        <p class="caption">The smallest language units the model works with.</p>
        <div id="tokens" class="tokens"></div>
      </article>
      <article class="card">
        <div class="head"><h3>Token IDs</h3></div>
        <p class="caption">Their numeric form inside the model’s computations.</p>
        <pre id="ids" class="ids"></pre>
      </article>
    </section>
  </main>
  <footer class="container">
    <small>Built by Peter Adams • Powered in your browser by <a href="https://github.com/xenova/transformers.js" target="_blank" rel="noreferrer">Transformers.js</a></small>
  </footer>
  <!-- Minimal, robust script (no copy/export) -->
  <script type="module">
    // Prefer keeping all requests on huggingface.co to avoid CORS/VPN issues.
    // Option 1 (simple): CDN import (works on many networks)
    const tf = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.2');
    // Option 2 (bulletproof): self-host the file in your Space and use:
    // const tf = await import('./assets/vendor/transformers.min.js');
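    // A hedged sketch combining both options: try the CDN first, then fall back to a
    // self-hosted copy (assumes you have uploaded transformers.min.js to assets/vendor/):
    //   let tf;
    //   try { tf = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.2'); }
    //   catch { tf = await import('./assets/vendor/transformers.min.js'); }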
    tf.env.useBrowserCache = true;
    tf.env.allowLocalModels = true; // <-- REQUIRED for local folder paths
    tf.env.localModelPath = '/';
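    // With localModelPath set to '/', the local model id 'assets/gpt2' should resolve to
    // /assets/gpt2 on this origin (assumption: default local path joining in Transformers.js v2).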
    const $ = s => document.querySelector(s);
    const modelSel = $('#model');
    const inputEl = $('#input');
    const statusEl = $('#status');
    const tokensEl = $('#tokens');
    const idsEl = $('#ids');
    // Single state object; never reassign
    const state = { tokens: [], ids: [] };
    let tokenizer = null;
    let runId = 0;
    const status = (msg) => { statusEl.textContent = msg; };
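    // Collapse rapid keystrokes into a single call: fn only runs after ms of quiet.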
    const debounce = (fn, ms=200) => { let t; return (...a)=>{ clearTimeout(t); t=setTimeout(()=>fn(...a), ms); }; };
    async function loadTokenizer(modelId){
      status('Loading tokenizer…');
      try {
        if (modelId === 'local:gpt2') {
          // Note: no double slashes, no /resolve/main – just your folder.
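          // Assumption: assets/gpt2 holds a standard Hub tokenizer export,
          // i.e. tokenizer.json and tokenizer_config.json, which Transformers.js fetches.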
          tokenizer = await tf.AutoTokenizer.from_pretrained('assets/gpt2');
        } else {
          tokenizer = await tf.AutoTokenizer.from_pretrained(modelId);
        }
        status('Tokenizer ready.');
      } catch (e) {
        console.error('Tokenizer load failed:', e);
        tokenizer = null;
        status('Failed to load tokenizer (network blocked or slow). Try GPT-2 or a different VPN route.');
      }
    }
    async function tokenize(){
      const myRun = ++runId;
      if (!tokenizer) {
        await loadTokenizer(modelSel.value);
        if (!tokenizer) { render(); return; }
      }
      // Make sure we always pass a string to encode()
      const text = String(inputEl.value ?? '').trim();
      if (!text) {
        state.tokens = [];
        state.ids = [];
        render();
        status('Type to tokenize…');
        return;
      }
      status('Tokenizing…');
      try {
        const enc = await tokenizer.encode(text); // include specials (default)
        // Handle both array/object return shapes
        const ids = Array.isArray(enc)
          ? enc
          : (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];
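        // (In @xenova/transformers v2, encode() returns a plain array of ids;
        // the object branches above are defensive for other return shapes.)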
        // Map special IDs -> special token strings (if available)
        const specialIds = Array.from(tokenizer.all_special_ids || []);
        const specialTokens = Array.from(tokenizer.all_special_tokens || []);
        const idToSpecial = new Map(specialIds.map((id, i) => [id, specialTokens[i]]));
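        // For GPT-2, for example, this maps id 50256 to '<|endoftext|>'.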
        // Build token strings for every ID (specials included)
        let tokens = [];
        if (typeof tokenizer.convert_ids_to_tokens === 'function') {
          tokens = tokenizer.convert_ids_to_tokens(ids);
        } else if (typeof tokenizer.id_to_token === 'function') {
          tokens = ids.map(id => tokenizer.id_to_token(id));
        } else if (!Array.isArray(enc) && Array.isArray(enc.tokens)) {
          tokens = enc.tokens;
        } else {
          // Fallback: decode each ID as a single-piece token
          tokens = ids.map(id =>
            tokenizer.decode([id], {
              // we WANT specials in the stream; decode may return "" for them
              skip_special_tokens: false,
              clean_up_tokenization_spaces: false,
            })
          );
        }
        // Ensure specials are visible: if a special token decodes to empty,
        // replace it with its canonical name or a generic tag.
        tokens = tokens.map((tok, i) => {
          const id = ids[i];
          if (tok && tok.length) return tok;
          if (idToSpecial.has(id)) return idToSpecial.get(id); // e.g., <|endoftext|> for GPT-2
          return `<special:${id}>`;
        });
        if (myRun !== runId) return;
        state.tokens = tokens;
        state.ids = ids; // include specials in the count
        render();
        status(`Done. ${state.tokens.length} tokens.`);
      } catch (e) {
        console.error('Tokenize failed:', e);
        render();
        status('Error tokenizing. See console.');
      }
    }
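    // Repaint both panels from state; safe to call even when state is empty.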
    function render(){
      const tokens = Array.isArray(state.tokens) ? state.tokens : [];
      const ids = Array.isArray(state.ids) ? state.ids : [];
      const specialSet = new Set(tokenizer?.all_special_ids || []); // tokenizer may be null after a failed load
      tokensEl.innerHTML = '';
      tokens.forEach((tok, i) => {
        const chip = document.createElement('span');
        chip.className = 'chip';
        if (specialSet.has(ids[i])) chip.classList.add('special'); // <-- highlight specials
        chip.dataset.i = i;
        chip.textContent = tok;
        chip.addEventListener('mouseenter', ()=>highlight(i, true));
        chip.addEventListener('mouseleave', ()=>highlight(i, false));
        tokensEl.appendChild(chip);
      });
      idsEl.textContent = ids.join(' ');
      if (tokens.length === 0) status('Type to tokenize…');
    }
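    // Bracket the hovered token's ID in the ID stream and outline its chip.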
    function highlight(i, on){
      const ids = Array.isArray(state.ids) ? state.ids : [];
      if (!ids.length) return;
      const parts = ids.map((id, idx) => (idx === i && on) ? `[${id}]` : String(id));
      idsEl.textContent = parts.join(' ');
      const chip = tokensEl.querySelector(`[data-i="${i}"]`);
      if (chip) chip.classList.toggle('active', on);
    }
    const debounced = debounce(tokenize, 200);
    inputEl.addEventListener('input', debounced);
    modelSel.addEventListener('change', async ()=>{
      tokenizer = null; // force reload
      await loadTokenizer(modelSel.value);
      tokenize();
    });
    // Initial load
    await loadTokenizer(modelSel.value);
    tokenize();
  </script>
</body>
</html>