<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>TokenVisualizer — Minimal</title>
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600&family=JetBrains+Mono:wght@400;600&display=swap" rel="stylesheet">
  <style>
    :root{
      --bg:#0b0f14; --text:#ffffff; --muted:#9aa4b2; --accent:#38bdf8; --border:#1f2a3a;
      --card1:#0c1624; --card2:#0a1220; --chip:#111827; --chip-border:#263246; --chip-hover:#1a2434;
      --mono:'JetBrains Mono',ui-monospace,Menlo,Consolas,monospace; --sans:Inter,system-ui,-apple-system,"Segoe UI",Roboto,Ubuntu,"Helvetica Neue",Arial;
    }
    *{box-sizing:border-box} body{margin:0;background:radial-gradient(900px 500px at 10% -10%, #07314a, transparent),var(--bg);color:var(--text);font-family:var(--sans)}
    .container{max-width:1100px;margin:0 auto;padding:1.25rem}
    header{padding-top:1.5rem} h1{margin:.2rem 0 .4rem;font-size:1.9rem}
    .sub{color:var(--muted);margin:.25rem 0 1rem}
    .card{background:linear-gradient(180deg,var(--card1),var(--card2));border:1px solid var(--border);border-radius:14px;padding:1rem;box-shadow:0 10px 40px rgba(0,0,0,.35)}
    label span{color:var(--muted);font-size:.9rem}
    select,textarea{width:100%;border-radius:10px;border:1px solid var(--border);background:#0a1220;color:var(--text);padding:.7rem .85rem;outline:none}
    select:focus,textarea:focus{border-color:var(--accent)}
    .controls{display:grid;gap:.8rem;margin-bottom:1rem}
    .row{display:flex;gap:.75rem;align-items:center}
    .status{color:var(--muted)}
    .grid{display:grid;gap:1rem;grid-template-columns:1fr}
    @media (min-width:900px){.grid{grid-template-columns:1fr 1fr}}
    .head{display:flex;align-items:center;justify-content:space-between;margin-bottom:0}
    .tokens{display:flex;flex-wrap:wrap;gap:.5rem;max-height:360px;overflow:auto;padding:.25rem}
    .chip{border:1px solid var(--chip-border);background:var(--chip);padding:.35rem .5rem;border-radius:10px;font-family:var(--mono);font-size:.9rem;transition:background .12s,border-color .12s}
    .chip:hover{background:var(--chip-hover);border-color:var(--accent)}
    .chip.active{outline:2px solid var(--accent)}
    .chip.special{border-color:#38bdf8;background:#0b2235}
    pre.ids{font-family:var(--mono);background:#0a1220;border:1px solid var(--border);border-radius:10px;padding:.75rem;max-height:360px;overflow:auto;white-space:pre-wrap}
    .caption{color:var(--muted);font-size:.9rem;margin-top:0;margin-bottom:.75rem}
    footer{color:var(--muted);text-align:center;padding:1.25rem 0 2rem}
    a{color:var(--accent)}
  </style>
</head>
<body>
  <header class="container">
    <h1>Token Visualizer</h1>
    <p class="sub">Enter any text and see how a language model splits it into tokens and numeric IDs, the units it actually computes on.</p>
  </header>
  <main class="container">
    <section class="card controls">
      <label>
        <span>Model</span>
        <select id="model">
          <!-- Tip: keep this first so the demo works instantly once you upload /assets/gpt2/* -->
          <option value="local:gpt2">GPT-2 (local, fast)</option>
          <option value="Xenova/llama2-tokenizer">Llama-2 (Hub)</option>
          <option value="Xenova/mistral-tokenizer">Mistral (Hub)</option>
          <option value="Xenova/gemma-tokenizer">Gemma (Hub)</option>
          <option value="Xenova/bert-base-uncased">BERT Base Uncased (Hub)</option>
        </select>
      </label>
      <label>
        <span>Text</span>
        <textarea id="input" rows="3">Curiosity propelled the cat to unfathomable heights.</textarea>
      </label>
      <div class="row">
        <span id="status" class="status">Loading tokenizer…</span>
      </div>
    </section>
    <section class="grid">
      <article class="card">
        <div class="head"><h3>Tokens</h3></div>
        <p class="caption">The smallest language units the model works with.</p>
        <div id="tokens" class="tokens"></div>
      </article>
      <article class="card">
        <div class="head"><h3>Token IDs</h3></div>
        <p class="caption">Their numeric form inside the model’s computations.</p>
        <pre id="ids" class="ids"></pre>
      </article>
    </section>
  </main>
  <footer class="container">
    <small>Built by Peter Adams • Powered in your browser by <a href="https://github.com/xenova/transformers.js" target="_blank" rel="noreferrer">Transformers.js</a></small>
  </footer>
  <!-- Minimal, robust script (no copy/export) -->
  <script type="module">
    // Prefer keeping all requests on huggingface.co to avoid CORS/VPN issues.
    // Option 1 (simple): CDN import (works on many networks)
    const tf = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.2');
    // Option 2 (bulletproof): self-host the file in your Space and use:
    // const tf = await import('./assets/vendor/transformers.min.js');
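    // A hedged sketch combining both options: try the CDN first, then fall back to a
    // self-hosted copy (assumes you have uploaded transformers.min.js to assets/vendor/):
    //   let tf;
    //   try { tf = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.2'); }
    //   catch { tf = await import('./assets/vendor/transformers.min.js'); }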
    tf.env.useBrowserCache = true;
    tf.env.allowLocalModels = true; // <-- REQUIRED for local folder paths
    tf.env.localModelPath = '/';
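    // With localModelPath set to '/', the local model id 'assets/gpt2' should resolve to
    // /assets/gpt2 on this origin (assumption: default local path joining in Transformers.js v2).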
    const $ = s => document.querySelector(s);
    const modelSel = $('#model');
    const inputEl = $('#input');
    const statusEl = $('#status');
    const tokensEl = $('#tokens');
    const idsEl = $('#ids');
    // Single state object; never reassign
    const state = { tokens: [], ids: [] };
    let tokenizer = null;
    let runId = 0;
    const status = (msg) => { statusEl.textContent = msg; };
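    // Collapse rapid keystrokes into a single call: fn only runs after ms of quiet.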
    const debounce = (fn, ms=200) => { let t; return (...a)=>{ clearTimeout(t); t=setTimeout(()=>fn(...a), ms); }; };
    async function loadTokenizer(modelId){
      status('Loading tokenizer…');
      try {
        if (modelId === 'local:gpt2') {
          // Note: no double slashes, no /resolve/main – just your folder.
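          // Assumption: assets/gpt2 holds a standard Hub tokenizer export,
          // i.e. tokenizer.json and tokenizer_config.json, which Transformers.js fetches.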
          tokenizer = await tf.AutoTokenizer.from_pretrained('assets/gpt2');
        } else {
          tokenizer = await tf.AutoTokenizer.from_pretrained(modelId);
        }
        status('Tokenizer ready.');
      } catch (e) {
        console.error('Tokenizer load failed:', e);
        tokenizer = null;
        status('Failed to load tokenizer (network blocked or slow). Try GPT-2 or a different VPN route.');
      }
    }
    async function tokenize(){
      const myRun = ++runId;
      if (!tokenizer) {
        await loadTokenizer(modelSel.value);
        if (!tokenizer) { render(); return; }
      }
      // Make sure we always pass a string to encode()
      const text = String(inputEl.value ?? '').trim();
      if (!text) {
        state.tokens = [];
        state.ids = [];
        render();
        status('Type to tokenize…');
        return;
      }
      status('Tokenizing…');
      try {
        const enc = await tokenizer.encode(text); // include specials (default)
        // Handle both array/object return shapes
        const ids = Array.isArray(enc)
          ? enc
          : (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];
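        // (In @xenova/transformers v2, encode() returns a plain array of ids;
        // the object branches above are defensive for other return shapes.)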
        // Map special IDs -> special token strings (if available)
        const specialIds = Array.from(tokenizer.all_special_ids || []);
        const specialTokens = Array.from(tokenizer.all_special_tokens || []);
        const idToSpecial = new Map(specialIds.map((id, i) => [id, specialTokens[i]]));
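        // For GPT-2, for example, this maps id 50256 to '<|endoftext|>'.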
        // Build token strings for every ID (specials included)
        let tokens = [];
        if (typeof tokenizer.convert_ids_to_tokens === 'function') {
          tokens = tokenizer.convert_ids_to_tokens(ids);
        } else if (typeof tokenizer.id_to_token === 'function') {
          tokens = ids.map(id => tokenizer.id_to_token(id));
        } else if (!Array.isArray(enc) && Array.isArray(enc.tokens)) {
          tokens = enc.tokens;
        } else {
          // Fallback: decode each ID as a single-piece token
          tokens = ids.map(id =>
            tokenizer.decode([id], {
              // we WANT specials in the stream; decode may return "" for them
              skip_special_tokens: false,
              clean_up_tokenization_spaces: false,
            })
          );
        }
        // Ensure specials are visible: if a special token decodes to empty,
        // replace it with its canonical name or a generic tag.
        tokens = tokens.map((tok, i) => {
          const id = ids[i];
          if (tok && tok.length) return tok;
          if (idToSpecial.has(id)) return idToSpecial.get(id); // e.g., <|endoftext|> for GPT-2
          return `<special:${id}>`;
        });
        if (myRun !== runId) return;
        state.tokens = tokens;
        state.ids = ids; // include specials in the count
        render();
        status(`Done. ${state.tokens.length} tokens.`);
      } catch (e) {
        console.error('Tokenize failed:', e);
        render();
        status('Error tokenizing. See console.');
      }
    }
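    // Repaint both panels from state; safe to call even when state is empty.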
    function render(){
      const tokens = Array.isArray(state.tokens) ? state.tokens : [];
      const ids = Array.isArray(state.ids) ? state.ids : [];
      const specialSet = new Set(tokenizer?.all_special_ids || []); // tokenizer may be null after a failed load
      tokensEl.innerHTML = '';
      tokens.forEach((tok, i) => {
        const chip = document.createElement('span');
        chip.className = 'chip';
        if (specialSet.has(ids[i])) chip.classList.add('special'); // <-- highlight specials
        chip.dataset.i = i;
        chip.textContent = tok;
        chip.addEventListener('mouseenter', ()=>highlight(i, true));
        chip.addEventListener('mouseleave', ()=>highlight(i, false));
        tokensEl.appendChild(chip);
      });
      idsEl.textContent = ids.join(' ');
      if (tokens.length === 0) status('Type to tokenize…');
    }
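    // Bracket the hovered token's ID in the ID stream and outline its chip.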
    function highlight(i, on){
      const ids = Array.isArray(state.ids) ? state.ids : [];
      if (!ids.length) return;
      const parts = ids.map((id, idx) => (idx === i && on) ? `[${id}]` : String(id));
      idsEl.textContent = parts.join(' ');
      const chip = tokensEl.querySelector(`[data-i="${i}"]`);
      if (chip) chip.classList.toggle('active', on);
    }
    const debounced = debounce(tokenize, 200);
    inputEl.addEventListener('input', debounced);
    modelSel.addEventListener('change', async ()=>{
      tokenizer = null; // force reload
      await loadTokenizer(modelSel.value);
      tokenize();
    });
    // Initial load
    await loadTokenizer(modelSel.value);
    tokenize();
  </script>
</body>
</html>