File size: 17,214 Bytes
5d3fe93 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 |
# -*- coding: utf-8 -*-
"""gpt-dev.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1wAoJHP666APJNiFpvBVvJRpMwe04P4_1
"""
# when you restart a Jupyter Notebook, even if you see the outputs from the previous session, the variables, functions, and states in memory are lost. You need to re-run the cells to reload everything into memory.
import torch
import torch.nn as nn
from torch.nn import functional as F
import urllib.request
# Function to download the file and read its contents
def load_text_file(url):
    """Download a text file and return its contents as a string.

    Args:
        url: Location of the file; anything ``urllib.request.urlopen``
            accepts.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the underlying connection even when
    # reading or decoding fails, so the handle is never leaked.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')
# Location of the cleaned training corpus.
url = "https://raw.githubusercontent.com/PratyushChaudhary/My-LLM/refs/heads/main/cleaned_text_output.txt"
# Fetch the whole corpus into memory as one string.
text = load_text_file(url)
# The vocabulary is the sorted collection of distinct characters in the corpus;
# its length is the number of token ids the model has to handle.
chars = sorted(set(text))
vocab_size = len(chars)
# print(''.join(chars)) # join of these characters, unique ones
# print(vocab_size)
# ---- hyperparameters ----
# Batching: number of independent sequences per step, and the maximum
# context length (in tokens) used for a prediction.
batch_size, block_size = 64, 256
# Optimisation schedule.
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
# Prefer the GPU whenever PyTorch reports one.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
eval_iters = 200
# Model size: embedding width, attention heads per block, number of blocks.
n_embd, n_head, n_layer = 384, 6, 6
dropout = 0.2
# ----
# Seed PyTorch's global random number generator.
torch.manual_seed(1337)
# Character-level tokeniser: every distinct character gets one integer id,
# assigned in the (sorted) order of `chars`.
stoi = dict(zip(chars, range(len(chars))))  # character -> id
itos = dict(enumerate(chars))               # id -> character


def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, return the string they spell."""
    return ''.join(itos[i] for i in l)
# # This code offers a very basic form of tokeniser, wherein the sequence of integers is long but the integers in the sequence are small; in other words, the vocabulary is small
# print(encode("hi there"))
# print(decode(encode("hi there")))
# Tokenise/encode the whole corpus into one tensor of integer ids.
import torch  # import PyTorch library (kept from the notebook export)
data = torch.tensor(encode(text), dtype=torch.long)
# data.shape returns a tuple representing the dimensions of the tensor.
#
# Tensor:
#   A fundamental data structure in ML: a multi-dimensional array used to
#   store numerical data. It generalises matrices to higher dimensions.
#
# print(data.shape, data.dtype)
# print(data[:1000])  # the first 1000 characters, as the model will see them
#
# Overfitting:
#   A model overfits when it learns not only the underlying patterns of the
#   training data but also its noise, so it performs very well on the training
#   data and poorly on new, unseen data. The held-out split below is what lets
#   us check for that.

# Split the data: the first 90% is for training, the remainder for validation.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]
# We will train the transformer on chunks of dataset/text so that it's computationally inexpensive
# block size states the max length of our chunks
# block_size = 8
# train_data[:block_size+1]
# predictions are made on the basis of relative positions of these tokens
# x = train_data[:block_size]
# y = train_data[1:block_size+1]
# for t in range(block_size):
# context = x[:t+1]
# target = y[t]
# print(f"When input is {context} the target: {target}")
# Using the below code you ensure that any random numbers generated by PyTorch are reproducible, which means when you run the code multiple times, you'll get the same random numbers each time.
# This is useful for debugging or comparing results.
# The specific value doesn't matter, it's just used to initialise the random number generator in a consistent way.
# torch.manual_seed(1337)
# batch_size = 4 # how many independent sequences will we process in parallel?
# block_size = 8 # what is the maximum context length for predictions?
if __name__ == "__main__":
    # Training helpers. These are only defined when this file is executed
    # directly, not when it is imported.
    # NOTE(review): train_model() calls get_batch() and estimate_loss(), so it
    # can only run in script mode — confirm this is intended.

    def get_batch(split):
        """Sample a random batch of inputs x and targets y from one split.

        Args:
            split: 'train' selects train_data; any other value selects
                val_data.

        Returns:
            A pair (x, y) of tensors, each stacking batch_size windows of
            block_size tokens and moved to `device`. y is the same window as
            x shifted one position to the right.
        """
        source = train_data if split == 'train' else val_data
        # One random start offset per sequence. The upper bound leaves room
        # for a full window plus the one-token shift used for the targets.
        starts = torch.randint(len(source) - block_size, (batch_size,))
        # Stack the individual windows into (batch_size, block_size) tensors.
        x = torch.stack([source[s:s + block_size] for s in starts])
        y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
        return x.to(device), y.to(device)

    @torch.no_grad()
    def estimate_loss():
        """Estimate the mean loss on the train and validation splits.

        Averages the loss over eval_iters random batches per split with the
        model in eval mode, then switches the model back to train mode.

        Returns:
            dict with keys 'train' and 'val', each holding the mean loss as a
            0-dimensional tensor.
        """
        out = {}
        model.eval()
        try:
            # A tuple, not a set: the iteration order of a set of strings can
            # differ between interpreter runs (hash randomisation), which
            # would change which random batches each split draws and defeat
            # the fixed seed.
            for split in ('train', 'val'):
                losses = torch.zeros(eval_iters)
                for k in range(eval_iters):
                    X, Y = get_batch(split)
                    _, loss = model(X, Y)
                    losses[k] = loss.item()
                out[split] = losses.mean()
        finally:
            # Always hand the model back in training mode, even on failure.
            model.train()
        return out
# xb, yb = get_batch('train')
# print('inputs:')
# print(xb.shape)
# print(xb)
# print('targets:')
# print(yb.shape)
# print(yb)
# print('----')
# for b in range(batch_size): # batch dimension
# for t in range(block_size): # time dimension
# context = xb[b, :t+1]
# target = yb[b, t]
# print(f"when input is {context.tolist()} the target: {target}")
# import torch.nn as nn
# # below syntax is such because nn is a submodule of torch, and Python needs the full module path (torch.nn) to find the functional module correctly.
# from torch.nn import functional as F
# torch.manual_seed(1337)
class Head(nn.Module):
    """A single head of causal self-attention."""

    def __init__(self, head_size):
        """Create the key/query/value projections for a head of width head_size."""
        super().__init__()
        # Bias-free projections from the embedding width down to head_size.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones, registered as a buffer (not a
        # parameter); zeros mark the "future" positions to be masked out.
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', lower_triangle)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over x of shape (B, T, C) and return a (B, T, head_size) tensor."""
        _, seq_len, _ = x.shape
        keys = self.key(x)       # (B, T, hs)
        queries = self.query(x)  # (B, T, hs)
        # Attention scores ("affinities"), scaled by 1/sqrt(head size).
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Hide future positions so each token only sees itself and the past.
        hidden = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(hidden, float('-inf'))  # (B, T, T)
        weights = self.dropout(F.softmax(scores, dim=-1))   # (B, T, T)
        # Weighted aggregation of the values.
        values = self.value(x)  # (B, T, hs)
        return weights @ values  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, then mixed by a projection."""

    def __init__(self, num_heads, head_size):
        """Build num_heads heads of width head_size plus the output projection."""
        super().__init__()
        self.heads = nn.ModuleList(Head(head_size) for _ in range(num_heads))
        # Maps the concatenated head outputs back to the embedding width.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on x, concatenate along the last axis, project, drop out."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        return self.dropout(self.proj(merged))
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        """Build the MLP for inputs whose last dimension is n_embd."""
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to x."""
        return self.net(x)
class Block(nn.Module):
    """Transformer block: communication (attention) followed by computation (MLP)."""

    def __init__(self, n_embd, n_head):
        """Build a block.

        Args:
            n_embd: embedding dimension.
            n_head: number of attention heads; each head gets
                n_embd // n_head channels.
        """
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply both sub-layers, each on a layer-normed input with a residual add."""
        after_attention = x + self.sa(self.ln1(x))
        return after_attention + self.ffwd(self.ln2(after_attention))
# Transformer language model: predicts the next token from up to block_size
# preceding tokens.
class GPTLanguageModel(nn.Module):
    """Token + position embeddings, a stack of transformer blocks, a final
    layer norm, and a linear head that produces next-token logits."""

    def __init__(self):
        super().__init__()
        # Lookup tables giving every token id and every position an
        # n_embd-dimensional vector.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the forward pass: compute logits and, optionally, the loss.

        Shape names: B is the number of sequences in the batch, T the length
        of each sequence, C the number of channels (n_embd inside the network).

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids to predict.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is reshaped to
            (B*T, vocab_size) and loss is the cross-entropy against targets.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the position indices on the same device as the input rather
        # than on the module-level `device`, so the model works wherever its
        # inputs live.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C), pos_emb broadcast over the batch
        x = self.blocks(x)  # (B, T, C)
        x = self.ln_f(x)  # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        # If there are no targets, there is no loss to compute.
        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, classes) logits and (N,) targets.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend idx by sampling max_new_tokens new tokens.

        Args:
            idx: (B, T) tensor of token ids in the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: idx followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Crop to the last block_size tokens: the position table has no
            # entries beyond that.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            # Only the last time step predicts the next token.
            logits = logits[:, -1, :]  # (B, vocab_size)
            probs = F.softmax(logits, dim=-1)  # (B, vocab_size)
            # Sample from the distribution.
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # Append the sampled index to the running sequence.
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
# Build the network and place it on the selected device.
model = GPTLanguageModel()
model.to(device)  # nn.Module.to() moves the module in place and returns it
m = model         # second name for the same object
# Create a PyTorch optimiser over all model parameters.
optimiser = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
# batch_size = 32 # This specifies that 32 samples will be processed in one training step called batch.
# for steps in range(50000): # This loop will run for 100 steps. Each step is one iteration of training using a batch of data.
# # sample a batch of data
# xb, yb = get_batch('train')
# # evaluate the loss
# # logits are the raw output of the model before any activation function, representing the predicted probabilities for each class.
# logits, loss = m(xb, yb)
# optimiser.zero_grad(set_to_none = True)
# loss.backward()
# optimiser.step()
# print(loss.item())
def train_model(self=None, max_iters=max_iters, eval_interval=eval_interval, optimiser=optimiser):
    """Train the module-level `model`, printing train/val loss periodically.

    Every parameter has a default taken from the module-level setting of the
    same name (bound when this function is defined), so `train_model()` can be
    called with no arguments; explicit positional calls keep working.

    Args:
        self: unused; kept only so existing positional calls stay valid.
        max_iters: number of optimisation steps to run.
        eval_interval: evaluate and print the losses every this many steps
            (and also on the final step).
        optimiser: optimiser whose zero_grad()/step() are called each step.
    """
    for step in range(max_iters):
        # Every once in a while evaluate the loss on the train and val sets.
        if step % eval_interval == 0 or step == max_iters - 1:
            losses = estimate_loss()
            print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        # Sample a batch of data.
        xb, yb = get_batch('train')
        # Evaluate the loss and take one optimisation step.
        _, loss = model(xb, yb)
        optimiser.zero_grad(set_to_none=True)
        loss.backward()
        optimiser.step()


# Starting context for generating from the model: a single token with id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
"""## The mathematical trick in self-attention"""
# Notebook scratch section: toy demonstrations of how causal averaging can be
# written as a matrix product, building up to a single self-attention head.
# NOTE(review): if this section sits at module level it runs on import,
# rebinds names such as x, k, q, wei, B, T, C, and leaves the global RNG
# seeded with 42 — confirm this is intended before training.
# consider the following toy example:
torch.manual_seed(1337)
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
x.shape
# Version 1: explicit loops.
# We want xbow[b, t] = mean_{i<=t} x[b, i]
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t+1, C): every position up to and including t
        xbow[b, t] = torch.mean(xprev, 0)
# version 2: the same averages via a row-normalised lower-triangular matrix
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim = True)
xbow2 = wei @ x # (T, T) @ (B, T, C) broadcasts over the batch ---> (B, T, C)
torch.allclose(xbow, xbow2)
# version 3: use Softmax
# Masking with -inf and applying softmax gives each row uniform weights over
# the unmasked positions.
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)
# version 4: self-attention!
torch.manual_seed(1337)
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)
# let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
# The weights now come from query/key dot products instead of being uniform.
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)
tril = torch.tril(torch.ones(T, T))
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)
v = value(x)
out = wei @ v
# Scaled attention: multiply the scores by head_size**-0.5.
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1) * head_size**(-0.5)
# Notebook display cell: softmax of a small example vector (result is discarded).
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim = -1)
# returns the lower triangular part of the given matrix (result is discarded)
torch.tril(torch.ones(3, 3))
# we can be very efficient doing the above thing using matrix multiplication
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
# dividing by the row sums makes every row of a sum to 1
a = a / torch.sum(a, 1, keepdim = True)
b = torch.randint(0, 10, (3, 2)).float()
# row i of c is the mean of rows 0..i of b
c = a @ b
def generate_text(model, start_prompt, max_length=256, temperature=1.0):
    """Generate text by sampling from `model`, starting from a prompt.

    Args:
        model: callable taking a (1, T) tensor of token ids and returning
            (logits, loss), with logits of shape (1, T, vocab_size).
        start_prompt: non-empty string used as the initial context. Every
            character must be in the vocabulary.
        max_length: number of new tokens to sample.
        temperature: divisor applied to the logits before softmax.

    Returns:
        The prompt followed by the generated characters, as one string.

    Raises:
        ValueError: if start_prompt is empty.
        KeyError: if start_prompt contains a character unknown to `encode`.
    """
    prompt_ids = encode(start_prompt)
    if not prompt_ids:
        # Without at least one token there is no last time step to sample from.
        raise ValueError("start_prompt must contain at least one character")
    input_ids = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).to(device)
    model.eval()
    generated_ids = list(prompt_ids)
    with torch.no_grad():
        for _ in range(max_length):
            # Feed only the last block_size tokens: the model's position
            # embedding and causal mask cover at most block_size positions,
            # so a longer context would fail.
            logits, _ = model(input_ids[:, -block_size:])
            # Keep the last time step only and apply the temperature.
            logits = logits[:, -1, :] / temperature
            probs = torch.nn.functional.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated_ids.append(next_token.item())
            input_ids = torch.cat((input_ids, next_token), dim=1)
    return decode(generated_ids)
if __name__ == "__main__":
    # Script entry point: train with the module-level settings. train_model's
    # first parameter (`self`) is not used by its body; the model is passed
    # there only to fill the positional slot.
    train_model(model, max_iters, eval_interval, optimiser)