Tim Betz committed
Commit f0385bc · 1 Parent(s): c5f6b01

added calliope model test

Files changed (3)
  1. app.py +37 -43
  2. checkpoints/Calliope-123m.pt +3 -0
  3. model.py +298 -0
app.py CHANGED
@@ -1,63 +1,57 @@
import gradio as gr
- from huggingface_hub import InferenceClient
+ import torch
+ from transformers import AutoTokenizer

- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
+ from model import GPT, GPTConfig
+
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ def setup(model_path: str):
+     tokenizer = AutoTokenizer.from_pretrained("gpt2")
+     checkpoint = torch.load(model_path)
+     model = GPT(GPTConfig(**checkpoint["model_args"]))
+
+     # rename keys because of torch >=2.1
+     state_dict = {}
+     for key, val in checkpoint["model"].items():
+         if key.startswith("_orig_mod"):
+             state_dict[key[10:]] = val
+         else:
+             state_dict[key] = val
+     model.load_state_dict(state_dict)
+     model.to(DEVICE)
+     model.eval()
+     return model, tokenizer
+
+
+ model, tokenizer = setup("checkpoints/Calliope-123m.pt")


def respond(
    message,
-     history: list[tuple[str, str]],
-     system_message,
+     _history,
    max_tokens,
    temperature,
-     top_p,
):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
+     idx = model.generate(
+         torch.tensor(
+             [tokenizer.encode(message, add_special_tokens=False)], device=DEVICE
+         ),
+         max_new_tokens=max_tokens,
        temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
+     )
+     return tokenizer.decode(idx[0].cpu().numpy())

-         response += token
-         yield response

- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
+         gr.Slider(minimum=1, maximum=256, value=128, step=1, label="Max new tokens"),
+         gr.Slider(minimum=0.1, maximum=4.0, value=0.5, step=0.1, label="Temperature"),
    ],
)


if __name__ == "__main__":
-     demo.launch()
+     demo.launch()
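The new app no longer streams completions from the hosted Inference API; it tokenizes the prompt with the GPT-2 tokenizer, samples from the local checkpoint via `GPT.generate`, and returns the decoded sequence in one shot (`setup()` also strips the `_orig_mod.` prefix that `torch.compile` adds to saved state-dict keys). A minimal smoke test of this path outside Gradio might look like the sketch below; it assumes the LFS checkpoint has been pulled and can be loaded on the current device:

# hypothetical smoke test for the new inference path; importing app loads the checkpoint
from app import respond

# the history argument is unused by respond, so an empty list is fine
print(respond("Once upon a time", [], max_tokens=32, temperature=0.5))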
checkpoints/Calliope-123m.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff56d553e0aacef88f33c3241f1db6d14b4e95c429e71774067acf082bfcc196
+ size 495962355
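The repository only carries this Git LFS pointer; the roughly 496 MB binary is fetched with `git lfs pull`. A hedged sketch of inspecting the file once it is present, limited to the two keys app.py actually relies on:

# after `git lfs pull`, peek at the checkpoint structure used by setup()
# (depending on the torch version, torch.load may need weights_only=False here)
import torch

ckpt = torch.load("checkpoints/Calliope-123m.pt", map_location="cpu")
print(ckpt["model_args"])  # kwargs used to rebuild GPTConfig
print(len(ckpt["model"]))  # number of entries in the saved state dict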
model.py ADDED
@@ -0,0 +1,298 @@
+ """
+ Full definition of a GPT Language Model, all of it in this single file.
+ References:
+ 1) the official GPT-2 TensorFlow implementation released by OpenAI:
+ https://github.com/openai/gpt-2/blob/master/src/model.py
+ 2) huggingface/transformers PyTorch implementation:
+ https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
+ """
+
+ import inspect
+ import math
+ from dataclasses import dataclass
+
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ import bitsandbytes as bnb
+
+
+ @dataclass
+ class GPTConfig:
+     block_size: int = 1024
+     vocab_size: int = 50304  # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
+     n_layer: int = 12
+     n_head: int = 12
+     n_embd: int = 768
+     dropout: float = 0.0
+     bias: bool = False  # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+     batch_size: int = 16
+     name: str = "GPT2"
+     mlp_type: str = "gpt"  # gpt or llama
+     tokenizer: str = "gpt"
+
+
+ class LayerNorm(nn.Module):
+     """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""
+
+     def __init__(self, ndim, bias):
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(ndim))
+         self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
+
+     def forward(self, input):
+         return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
+
+
+ class Rotary(torch.nn.Module):
+     def __init__(self, dim, base=10000):
+         super().__init__()
+         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+         self.register_buffer("inv_freq", inv_freq)
+         self.seq_len_cached = None
+         self.cos_cached = None
+         self.sin_cached = None
+
+     def forward(self, x):
+         seq_len = x.shape[1]
+         if seq_len != self.seq_len_cached:
+             self.seq_len_cached = seq_len
+             t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
+             freqs = torch.outer(t, self.inv_freq).to(x.device)
+             self.cos_cached = freqs.cos()
+             self.sin_cached = freqs.sin()
+         return self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
+
+
+ def apply_rotary_emb(x, cos, sin):
+     assert x.ndim == 4  # multihead attention
+     d = x.shape[3] // 2
+     x1 = x[..., :d]
+     x2 = x[..., d:]
+     y1 = x1 * cos + x2 * sin
+     y2 = x1 * (-sin) + x2 * cos
+     return torch.cat([y1, y2], 3)
+
+
+ class CausalSelfAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+         self.head_dim = self.n_embd // self.n_head
+         assert self.n_embd % self.n_head == 0
+         # key, query, value projections for all heads, but in a batch
+         self.c_attn = nn.Linear(self.n_embd, 3 * self.n_embd, bias=False)
+         # output projection
+         self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False)
+         self.rotary = Rotary(self.head_dim)
+
+     def forward(self, x):
+         # batch size, sequence length, embedding dimensionality (n_embd)
+         B, T, C = x.size()
+         # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+         qkv = self.c_attn(x)
+         q, k, v = qkv.split(self.n_embd, dim=2)
+         k = k.view(B, T, self.n_head, self.head_dim)
+         q = q.view(B, T, self.n_head, self.head_dim)
+         v = v.view(B, T, self.n_head, self.head_dim)
+         cos, sin = self.rotary(q)
+         q = apply_rotary_emb(q, cos, sin)
+         k = apply_rotary_emb(k, cos, sin)
+         y = F.scaled_dot_product_attention(
+             q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), is_causal=True
+         )
+         # re-assemble all head outputs side by side
+         y = y.transpose(1, 2).contiguous().view(B, T, C)
+         y = self.c_proj(y)
+         return y
+
+
+ class MLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+         self.gelu = nn.GELU()
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+         self.dropout = nn.Dropout(config.dropout)
+
+     def forward(self, x):
+         x = self.c_fc(x)
+         x = self.gelu(x)
+         x = self.c_proj(x)
+         x = self.dropout(x)
+         return x
+
+
+ class LLaMAMLP(nn.Module):
+     def __init__(self, config: GPTConfig) -> None:
+         super().__init__()
+         self.fc_1 = nn.Linear(config.n_embd, int(3.5 * config.n_embd), bias=config.bias)
+         self.fc_2 = nn.Linear(config.n_embd, int(3.5 * config.n_embd), bias=config.bias)
+         self.proj = nn.Linear(int(3.5 * config.n_embd), config.n_embd, bias=config.bias)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x_fc_1 = self.fc_1(x)
+         x_fc_2 = self.fc_2(x)
+         x = torch.nn.functional.silu(x_fc_1) * x_fc_2
+         return self.proj(x)
+
+
+ class Block(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
+         self.attn = CausalSelfAttention(config)
+         self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
+         if config.mlp_type == "gpt":
+             self.mlp = MLP(config)
+         elif config.mlp_type == "llama":
+             self.mlp = LLaMAMLP(config)
+         else:
+             self.mlp = MLP(config)
+
+     def forward(self, x):
+         x = x + self.attn(self.ln_1(x))
+         x = x + self.mlp(self.ln_2(x))
+         return x
+
+
+ class GPT(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         assert config.vocab_size is not None
+         assert config.block_size is not None
+         self.config = config
+
+         self.transformer = nn.ModuleDict(
+             dict(
+                 wte=nn.Embedding(config.vocab_size, config.n_embd),
+                 wpe=nn.Embedding(config.block_size, config.n_embd),
+                 drop=nn.Dropout(config.dropout),
+                 h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+                 ln_f=LayerNorm(config.n_embd, bias=config.bias),
+             )
+         )
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+         self.transformer.wte.weight = self.lm_head.weight  # weight tying
+
+         self.apply(self._init_weights)
+         # apply special scaled init to the residual projections, per GPT-2 paper
+         for pn, p in self.named_parameters():
+             if pn.endswith("c_proj.weight"):
+                 torch.nn.init.normal_(
+                     p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer)
+                 )
+         print("number of parameters: %.2fM" % (self.get_num_params() / 1e6,))
+
+     def get_num_params(self, non_embedding=True):
+         """
+         Return the number of parameters in the model.
+         For non-embedding count (default), the position embeddings get subtracted.
+         The token embeddings would too, except due to the parameter sharing these
+         params are actually used as weights in the final layer, so we include them.
+         """
+         n_params = sum(p.numel() for p in self.parameters())
+         if non_embedding:
+             n_params -= self.transformer.wpe.weight.numel()
+         return n_params
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+     def forward(self, idx, targets=None):
+         device = idx.device
+         _, t = idx.size()
+         assert (
+             t <= self.config.block_size
+         ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+         pos = torch.arange(0, t, dtype=torch.long, device=device)  # shape (t)
+
+         # forward the GPT model itself
+         tok_emb = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
+         pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (t, n_embd)
+         x = self.transformer.drop(tok_emb + pos_emb)
+         for block in self.transformer.h:
+             x = block(x)
+         x = self.transformer.ln_f(x)
+
+         if targets is not None:
+             # if we are given some desired targets also calculate the loss
+             logits = self.lm_head(x)
+             loss = F.cross_entropy(
+                 logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1
+             )
+         else:
+             # inference-time mini-optimization: only forward the lm_head on the very last position
+             # note: using list [-1] to preserve the time dim
+             logits = self.lm_head(x[:, [-1], :])
+             loss = None
+
+         return logits, loss
+
+     def configure_optimizers(
+         self, weight_decay, learning_rate, betas, device_type, optim="torch"
+     ):
+         # start with all of the candidate parameters
+         param_dict = {pn: p for pn, p in self.named_parameters()}
+         # filter out those that do not require grad
+         param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
+         # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
+         # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
+         decay_params = [p for _, p in param_dict.items() if p.dim() >= 2]
+         nodecay_params = [p for _, p in param_dict.items() if p.dim() < 2]
+         optim_groups = [
+             {"params": decay_params, "weight_decay": weight_decay},
+             {"params": nodecay_params, "weight_decay": 0.0},
+         ]
+         # Create AdamW optimizer and use the fused version if it is available
+         fused_available = "fused" in inspect.signature(torch.optim.AdamW).parameters
+         use_fused = fused_available and device_type == "cuda"
+         if optim == "torch":
+             optimizer = torch.optim.AdamW(
+                 optim_groups,
+                 lr=learning_rate,
+                 betas=betas,
+                 fused=use_fused,
+                 foreach=False,
+             )
+             print(f"using fused AdamW: {use_fused}")
+         elif optim == "bnb":
+             optimizer = bnb.optim.AdamW8bit(optim_groups, lr=learning_rate, betas=betas)
+             print("Using bnb AdamW8bit")
+         else:
+             print("Invalid optim type")
+             return None
+         return optimizer
+
+     @torch.no_grad()
+     def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
+         """
+         Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
+         the sequence max_new_tokens times, feeding the predictions back into the model each time.
+         Most likely you'll want to make sure to be in model.eval() mode of operation for this.
+         """
+         for _ in range(max_new_tokens):
+             # if the sequence context is growing too long we must crop it at block_size
+             if idx.size(1) > self.config.block_size:
+                 idx = idx[:, -self.config.block_size :]
+             # forward the model to get the logits for the index in the sequence
+             logits, _ = self(idx)
+             # pluck the logits at the final step and scale by desired temperature
+             logits = logits[:, -1, :] / temperature
+             # optionally crop the logits to only the top k options
+             if top_k is not None:
+                 v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                 logits[logits < v[:, [-1]]] = -float("Inf")
+             # apply softmax to convert logits to (normalized) probabilities
+             probs = F.softmax(logits, dim=-1)
+             # sample from the distribution
+             idx_next = torch.multinomial(probs, num_samples=1)
+             # append sampled index to the running sequence and continue
+             idx = torch.cat((idx, idx_next), dim=1)
+         return idx
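Since model.py is self-contained, `GPT.generate` can also be exercised without the Gradio app. The sketch below mirrors what app.py does, but with a tiny, randomly initialized config chosen for illustration, so the output is gibberish; note that importing model.py requires `bitsandbytes` to be installed:

# standalone sketch: build a small GPT, encode a prompt, and sample from it
import torch
from transformers import AutoTokenizer

from model import GPT, GPTConfig

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT(GPTConfig(n_layer=2, n_head=2, n_embd=128))  # tiny config, random weights
model.eval()

idx = torch.tensor([tokenizer.encode("Once upon a time", add_special_tokens=False)])
out = model.generate(idx, max_new_tokens=20, temperature=0.8, top_k=50)
print(tokenizer.decode(out[0].tolist()))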