Vasudevakrishna committed on
Commit
1248b58
•
1 Parent(s): fe22298

Upload 4 files

Files changed (4)
  1. README.md +13 -13
  2. app.py +76 -0
  3. model.py +256 -0
  4. requirements.txt +4 -0
README.md CHANGED
@@ -1,13 +1,13 @@
- ---
- title: ERAV2 S21 124Model
- emoji: 🌖
- colorFrom: green
- colorTo: blue
- sdk: gradio
- sdk_version: 4.37.2
- app_file: app.py
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+ ---
+ title: ERAV2 S21 124Model
+ emoji: 🌖
+ colorFrom: green
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 4.37.2
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,76 @@
+ import torch
+ import tiktoken
+ from model import GPT, GPTConfig
+ import gradio as gr
+ from torch.nn import functional as F
+
+ # pick the best available device: CUDA, then Apple MPS, then CPU
+ device = "cpu"
+ if torch.cuda.is_available():
+     device = "cuda"
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+     device = "mps"
+
+ num_return_sequences = 1
+ # max_length = 100
+ model = GPT(GPTConfig())
+ model.to(device)
+ model.load_state_dict(torch.load('./checkpoints/final_model.pth', map_location=device))
+
+ # set the model to evaluation mode
+ model.eval()
+
+
+ def generate(text, max_length):
+     max_length = int(max_length)  # slider values may arrive as floats
+     enc = tiktoken.get_encoding("gpt2")
+     tokens = enc.encode(text)
+     tokens = torch.tensor(tokens, dtype=torch.long)  # (len,)
+     tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)  # (B, len)
+     x = tokens.to(device)
+
+     while x.size(1) < max_length:
+         # forward the model to get the logits
+         with torch.no_grad():
+             logits = model(x)[0]  # (B, T, vocab_size)
+         # take the logits at the last position
+         logits = logits[:, -1, :]  # (B, vocab_size)
+         # get the probabilities
+         probs = F.softmax(logits, dim=-1)
+         # do top-k sampling of 50 (huggingface pipeline default)
+         # topk_probs and topk_indices are both (B, 50)
+         topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
+         # select a token from the top-k probabilities
+         # note: multinomial does not demand the input to sum to 1
+         ix = torch.multinomial(topk_probs, 1)  # (B, 1)
+         # gather the corresponding indices
+         xcol = torch.gather(topk_indices, -1, ix)  # (B, 1)
+         # append to the sequence
+         x = torch.cat((x, xcol), dim=1)
+
+     # decode and return the generated text
+     for i in range(num_return_sequences):
+         tokens = x[i, :max_length].tolist()
+         decoded = enc.decode(tokens)
+     return decoded
+
+
+ title = "Shakespeare poem generation using GPT - 124M model."
+ description = "A simple Gradio interface to demo generation of Shakespeare-style poems."
+ # each example supplies both inputs: (prompt, token length); 50 is a placeholder default
+ examples = [["Let us kill him, and we'll have corn at our own price.", 50],
+             ["Would you proceed especially against Caius Marcius?", 50],
+             ["Nay, but speak not maliciously.", 50]]
+ demo = gr.Interface(
+     generate,
+     inputs=[
+         gr.TextArea(label="Enter text"),
+         gr.Slider(10, 100, value=10, step=1, label="Token Length"),
+     ],
+     outputs=[
+         gr.TextArea(label="Generated Text")
+     ],
+     title=title,
+     description=description,
+     examples=examples,
+     cache_examples=False,
+     live=True,
+ )
+ demo.launch()
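
The generation loop in app.py relies on top-k sampling. The sketch below is illustrative only and not part of the commit; it uses toy probabilities and k=3 instead of the app's k=50 to show how torch.topk, torch.multinomial, and torch.gather combine to pick the next token:

    import torch

    # toy next-token distribution over a 6-token vocabulary (batch size 1)
    probs = torch.tensor([[0.05, 0.40, 0.10, 0.25, 0.15, 0.05]])

    # keep only the k most probable tokens (k=3 here; app.py uses k=50)
    topk_probs, topk_indices = torch.topk(probs, 3, dim=-1)  # both (1, 3)

    # sample within the truncated set; multinomial renormalizes internally,
    # so the top-k probabilities do not need to sum to 1
    ix = torch.multinomial(topk_probs, 1)            # (1, 1) position within the top-k set
    next_token = torch.gather(topk_indices, -1, ix)  # (1, 1) vocabulary id
    print(next_token)

In app.py the sampled id is concatenated to the running sequence and the loop repeats until max_length tokens have been produced.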
model.py ADDED
@@ -0,0 +1,256 @@
+ # GPT-3 paper
+ # TODO: add cosine learning-rate decay
+ import os
+ import math
+ import time
+ import inspect
+ from dataclasses import dataclass
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+
+
+ class CausalSelfAttention(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         assert config.n_embd % config.n_head == 0
+         # key, query, value projections for all heads, but in a batch
+         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+         # output projection
+         self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+         self.c_proj.NANOGPT_SCALE_INIT = 1
+         # regularization
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+         # causal mask (kept for reference; flash attention handles causality itself)
+         self.register_buffer(
+             "bias",
+             torch.tril(torch.ones(config.block_size, config.block_size)).view(
+                 1, 1, config.block_size, config.block_size
+             ),
+         )
+
+     def forward(self, x):
+         B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
+         # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+         # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
+         # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
+         qkv = self.c_attn(x)
+         q, k, v = qkv.split(self.n_embd, dim=2)
+         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
+         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
+         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
+
+         # manual attention, kept for reference:
+         # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+         # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+         # att = F.softmax(att, dim=-1)
+         # y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+
+         y = F.scaled_dot_product_attention(q, k, v, is_causal=True)  # flash attention
+
+         y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
+         # output projection
+         y = self.c_proj(y)
+         return y
+
+
+ class MLP(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
+         self.gelu = nn.GELU(approximate="tanh")
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
+         self.c_proj.NANOGPT_SCALE_INIT = 1
+
+     def forward(self, x):
+         x = self.c_fc(x)
+         x = self.gelu(x)
+         x = self.c_proj(x)
+         return x
+
+
+ class Block(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = nn.LayerNorm(config.n_embd)
+         self.attn = CausalSelfAttention(config)
+         self.ln_2 = nn.LayerNorm(config.n_embd)
+         self.mlp = MLP(config)
+
+     def forward(self, x):
+         x = x + self.attn(self.ln_1(x))
+         x = x + self.mlp(self.ln_2(x))
+         return x
+
+
+ @dataclass
+ class GPTConfig:
+     block_size: int = 1024  # max sequence length
+     vocab_size: int = 50304  # 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token = 50257, padded up to a multiple of 64 for efficiency
+     n_layer: int = 12  # number of layers
+     n_head: int = 12  # number of heads
+     n_embd: int = 768  # embedding dimension
+
+
+ class GPT(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+
+         self.transformer = nn.ModuleDict(
+             dict(
+                 wte=nn.Embedding(config.vocab_size, config.n_embd),
+                 wpe=nn.Embedding(config.block_size, config.n_embd),
+                 h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+                 ln_f=nn.LayerNorm(config.n_embd),
+             )
+         )
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+         # weight sharing between token embedding and output head
+         self.transformer.wte.weight = self.lm_head.weight
+
+         # weight initialization
+         self.apply(self._init_weights)
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             std = 0.02
+             if hasattr(module, "NANOGPT_SCALE_INIT"):
+                 std *= (2 * self.config.n_layer) ** -0.5
+             torch.nn.init.normal_(module.weight, mean=0.0, std=std)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+     def forward(self, idx, targets=None):
+         # idx is of shape (B, T)
+         B, T = idx.size()
+         assert T <= self.config.block_size, (
+             f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
+         )
+         # forward the token and position embeddings
+         pos = torch.arange(0, T, dtype=torch.long, device=idx.device)  # shape (T)
+         pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (T, n_embd)
+         tok_emb = self.transformer.wte(idx)  # token embeddings of shape (B, T, n_embd)
+         x = tok_emb + pos_emb
+         # forward the blocks of the transformer
+         for block in self.transformer.h:
+             x = block(x)
+         # forward the final layernorm and the classifier
+         x = self.transformer.ln_f(x)
+         logits = self.lm_head(x)  # (B, T, vocab_size)
+         loss = None
+         if targets is not None:
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+         return logits, loss
+
+     @classmethod
+     def from_pretrained(cls, model_type):
+         """Loads pretrained GPT-2 model weights from huggingface"""
+         assert model_type in {"gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"}
+         from transformers import GPT2LMHeadModel
+
+         print("loading weights from pretrained gpt: %s" % model_type)
+
+         # n_layer, n_head and n_embd are determined from model_type
+         config_args = {
+             "gpt2": dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
+             "gpt2-medium": dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
+             "gpt2-large": dict(n_layer=36, n_head=20, n_embd=1280),  # 774M params
+             "gpt2-xl": dict(n_layer=48, n_head=25, n_embd=1600),  # 1558M params
+         }[model_type]
+         config_args["vocab_size"] = 50257  # always 50257 for GPT model checkpoints
+         config_args["block_size"] = 1024  # always 1024 for GPT model checkpoints
+         # create a from-scratch initialized minGPT model
+         config = GPTConfig(**config_args)
+         model = GPT(config)
+         sd = model.state_dict()
+         sd_keys = sd.keys()
+         sd_keys = [k for k in sd_keys if not k.endswith(".attn.bias")]  # discard this mask / buffer, not a param
+
+         # init a huggingface/transformers model
+         model_hf = GPT2LMHeadModel.from_pretrained(model_type)
+         sd_hf = model_hf.state_dict()
+
+         # copy while ensuring all of the parameters are aligned and match in names and shapes
+         sd_keys_hf = sd_hf.keys()
+         sd_keys_hf = [k for k in sd_keys_hf if not k.endswith(".attn.masked_bias")]  # ignore these, just a buffer
+         sd_keys_hf = [k for k in sd_keys_hf if not k.endswith(".attn.bias")]  # same, just the mask (buffer)
+         transposed = [
+             "attn.c_attn.weight",
+             "attn.c_proj.weight",
+             "mlp.c_fc.weight",
+             "mlp.c_proj.weight",
+         ]
+         # the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
+         # this means that we have to transpose these weights when we import them
+         assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
+         for k in sd_keys_hf:
+             if any(k.endswith(w) for w in transposed):
+                 # special treatment for the Conv1D weights we need to transpose
+                 assert sd_hf[k].shape[::-1] == sd[k].shape
+                 with torch.no_grad():
+                     sd[k].copy_(sd_hf[k].t())
+             else:
+                 # vanilla copy over the other parameters
+                 assert sd_hf[k].shape == sd[k].shape
+                 with torch.no_grad():
+                     sd[k].copy_(sd_hf[k])
+
+         return model
+
+     def configure_optimizers(self, weight_decay, learning_rate, device_type):
+         # start with all of the candidate parameters (that require grad)
+         param_dict = {pn: p for pn, p in self.named_parameters()}
+         param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
+         # create optim groups. Any parameter that is 2D will be weight decayed, otherwise not.
+         # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
+         decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
+         nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
+         optim_groups = [
+             {"params": decay_params, "weight_decay": weight_decay},
+             {"params": nodecay_params, "weight_decay": 0.0},
+         ]
+         num_decay_params = sum(p.numel() for p in decay_params)
+         num_nodecay_params = sum(p.numel() for p in nodecay_params)
+
+         print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
+         print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
+         # create AdamW optimizer and use the fused version if it is available
+         fused_available = "fused" in inspect.signature(torch.optim.AdamW).parameters
+         use_fused = fused_available and device_type == "cuda"
+
+         print(f"using fused AdamW: {use_fused}")
+         optimizer = torch.optim.AdamW(
+             optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused
+         )
+         return optimizer
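
As a quick sanity check of model.py, a sketch like the following (hypothetical, not included in the commit) runs a forward pass on random token ids; it should produce logits of shape (B, T, vocab_size) and, when dummy targets are supplied, a scalar cross-entropy loss near ln(50304) ≈ 10.8 at random initialization:

    import torch
    from model import GPT, GPTConfig

    config = GPTConfig()
    model = GPT(config)
    model.eval()

    B, T = 2, 16
    idx = torch.randint(0, config.vocab_size, (B, T))      # random token ids
    targets = torch.randint(0, config.vocab_size, (B, T))  # dummy targets, exercises the loss path only

    with torch.no_grad():
        logits, loss = model(idx, targets)

    print(logits.shape)  # torch.Size([2, 16, 50304])
    print(loss.item())   # roughly ln(vocab_size) ≈ 10.8 before any training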
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ torch
+ numpy
+ dataclasses
+ tiktoken