Spaces:

mynti
/

calliope-demo

Runtime error

calliope-demo / model.py

Tim Betz

bitsandbytes removed for inference

bcf8fcd over 1 year ago

10.3 kB

	"""
	Full definition of a GPT Language Model, all of it in this single file.
	References:
	1) the official GPT-2 TensorFlow implementation released by OpenAI:
	https://github.com/openai/gpt-2/blob/master/src/model.py
	2) huggingface/transformers PyTorch implementation:
	https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
	"""

	import math
	from dataclasses import dataclass

	import torch
	import torch.nn as nn
	from torch.nn import functional as F


	@dataclass
	class GPTConfig:
	block_size: int = 1024
	vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
	n_layer: int = 12
	n_head: int = 12
	n_embd: int = 768
	dropout: float = 0.0
	bias: bool = False # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
	batch_size: int = 16
	name: str = "GPT2"
	mlp_type: str = "gpt" # gpt or llama
	tokenizer: str = "gpt"


	class LayerNorm(nn.Module):
	"""LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""

	def __init__(self, ndim, bias):
	super().__init__()
	self.weight = nn.Parameter(torch.ones(ndim))
	self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

	def forward(self, input):
	return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)


	class Rotary(torch.nn.Module):
	def __init__(self, dim, base=10000):
	super().__init__()
	inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
	self.register_buffer("inv_freq", inv_freq)
	self.seq_len_cached = None
	self.cos_cached = None
	self.sin_cached = None

	def forward(self, x):
	seq_len = x.shape[1]
	if seq_len != self.seq_len_cached:
	self.seq_len_cached = seq_len
	t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
	freqs = torch.outer(t, self.inv_freq).to(x.device)
	self.cos_cached = freqs.cos()
	self.sin_cached = freqs.sin()
	return self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]


	def apply_rotary_emb(x, cos, sin):
	assert x.ndim == 4 # multihead attention
	d = x.shape[3] // 2
	x1 = x[..., :d]
	x2 = x[..., d:]
	y1 = x1 * cos + x2 * sin
	y2 = x1 * (-sin) + x2 * cos
	return torch.cat([y1, y2], 3)


	class CausalSelfAttention(nn.Module):
	def __init__(self, config):
	super().__init__()
	self.n_head = config.n_head
	self.n_embd = config.n_embd
	self.head_dim = self.n_embd // self.n_head
	assert self.n_embd % self.n_head == 0
	# key, query, value projections for all heads, but in a batch
	self.c_attn = nn.Linear(self.n_embd, 3 * self.n_embd, bias=False)
	# output projection
	self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False)
	self.rotary = Rotary(self.head_dim)

	def forward(self, x):
	# batch size, sequence length, embedding dimensionality (n_embd)
	B, T, C = x.size()
	# calculate query, key, values for all heads in batch and move head forward to be the batch dim
	qkv = self.c_attn(x)
	q, k, v = qkv.split(self.n_embd, dim=2)
	k = k.view(B, T, self.n_head, self.head_dim)
	q = q.view(B, T, self.n_head, self.head_dim)
	v = v.view(B, T, self.n_head, self.head_dim)
	cos, sin = self.rotary(q)
	q = apply_rotary_emb(q, cos, sin)
	k = apply_rotary_emb(k, cos, sin)
	y = F.scaled_dot_product_attention(
	q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), is_causal=True
	)
	# re-assemble all head outputs side by side
	y = y.transpose(1, 2).contiguous().view(B, T, C)
	y = self.c_proj(y)
	return y


	class MLP(nn.Module):
	def __init__(self, config):
	super().__init__()
	self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
	self.gelu = nn.GELU()
	self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
	self.dropout = nn.Dropout(config.dropout)

	def forward(self, x):
	x = self.c_fc(x)
	x = self.gelu(x)
	x = self.c_proj(x)
	x = self.dropout(x)
	return x


	class LLaMAMLP(nn.Module):
	def __init__(self, config: GPTConfig) -> None:
	super().__init__()
	self.fc_1 = nn.Linear(config.n_embd, int(3.5 * config.n_embd), bias=config.bias)
	self.fc_2 = nn.Linear(config.n_embd, int(3.5 * config.n_embd), bias=config.bias)
	self.proj = nn.Linear(int(3.5 * config.n_embd), config.n_embd, bias=config.bias)

	def forward(self, x: torch.Tensor) -> torch.Tensor:
	x_fc_1 = self.fc_1(x)
	x_fc_2 = self.fc_2(x)
	x = torch.nn.functional.silu(x_fc_1) * x_fc_2
	return self.proj(x)


	class Block(nn.Module):
	def __init__(self, config):
	super().__init__()
	self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
	self.attn = CausalSelfAttention(config)
	self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
	if config.mlp_type == "gpt":
	self.mlp = MLP(config)
	elif config.mlp_type == "llama":
	self.mlp = LLaMAMLP(config)
	else:
	self.mlp = MLP(config)

	def forward(self, x):
	x = x + self.attn(self.ln_1(x))
	x = x + self.mlp(self.ln_2(x))
	return x


	class GPT(nn.Module):
	def __init__(self, config):
	super().__init__()
	assert config.vocab_size is not None
	assert config.block_size is not None
	self.config = config

	self.transformer = nn.ModuleDict(
	dict(
	wte=nn.Embedding(config.vocab_size, config.n_embd),
	wpe=nn.Embedding(config.block_size, config.n_embd),
	drop=nn.Dropout(config.dropout),
	h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
	ln_f=LayerNorm(config.n_embd, bias=config.bias),
	)
	)
	self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
	self.transformer.wte.weight = self.lm_head.weight # weight tying

	self.apply(self._init_weights)
	# apply special scaled init to the residual projections, per GPT-2 paper
	for pn, p in self.named_parameters():
	if pn.endswith("c_proj.weight"):
	torch.nn.init.normal_(
	p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer)
	)
	print("number of parameters: %.2fM" % (self.get_num_params() / 1e6,))

	def get_num_params(self, non_embedding=True):
	"""
	Return the number of parameters in the model.
	For non-embedding count (default), the position embeddings get subtracted.
	The token embeddings would too, except due to the parameter sharing these
	params are actually used as weights in the final layer, so we include them.
	"""
	n_params = sum(p.numel() for p in self.parameters())
	if non_embedding:
	n_params -= self.transformer.wpe.weight.numel()
	return n_params

	def _init_weights(self, module):
	if isinstance(module, nn.Linear):
	torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
	if module.bias is not None:
	torch.nn.init.zeros_(module.bias)
	elif isinstance(module, nn.Embedding):
	torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

	def forward(self, idx, targets=None):
	device = idx.device
	_, t = idx.size()
	assert (
	t <= self.config.block_size
	), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
	pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

	# forward the GPT model itself
	tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
	pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
	x = self.transformer.drop(tok_emb + pos_emb)
	for block in self.transformer.h:
	x = block(x)
	x = self.transformer.ln_f(x)

	if targets is not None:
	# if we are given some desired targets also calculate the loss
	logits = self.lm_head(x)
	loss = F.cross_entropy(
	logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1
	)
	else:
	# inference-time mini-optimization: only forward the lm_head on the very last position
	# note: using list [-1] to preserve the time dim
	logits = self.lm_head(x[:, [-1], :])
	loss = None

	return logits, loss


	@torch.no_grad()
	def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
	"""
	Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
	the sequence max_new_tokens times, feeding the predictions back into the model each time.
	Most likely you'll want to make sure to be in model.eval() mode of operation for this.
	"""
	for _ in range(max_new_tokens):
	# if the sequence context is growing too long we must crop it at block_size
	if idx.size(1) > self.config.block_size:
	idx = idx[:, -self.config.block_size :]
	# forward the model to get the logits for the index in the sequence
	logits, _ = self(idx)
	# pluck the logits at the final step and scale by desired temperature
	logits = logits[:, -1, :] / temperature
	# optionally crop the logits to only the top k options
	if top_k is not None:
	v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
	logits[logits < v[:, [-1]]] = -float("Inf")
	# apply softmax to convert logits to (normalized) probabilities
	probs = F.softmax(logits, dim=-1)
	# sample from the distribution
	idx_next = torch.multinomial(probs, num_samples=1)
	# append sampled index to the running sequence and continue
	idx = torch.cat((idx, idx_next), dim=1)
	return idx