shivendrra committed on
Commit
7f4e854
1 Parent(s): 41885a6

added train and model files

Files changed (6)
  1. base/config.json +10 -0
  2. base/decoder.py +213 -0
  3. base/generate.py +120 -0
  4. base/model.py +279 -0
  5. base/run.py +101 -0
  6. base/tokenizer.py +19 -0
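Note: a minimal sketch of how these files fit together. The exact invocation is an assumption rather than part of the commit: both model files read config.json from the working directory, so the snippet assumes it is run from inside base/ and that no trained checkpoint is needed for a forward pass.

# hypothetical smoke test, run from inside base/ so that config.json resolves
import torch
from tokenizer import Tokenizer    # tiktoken-backed tokenizer added in this commit
from model import Transformer      # encoder-decoder variant; decoder.py holds a decoder-only variant

device = 'cuda' if torch.cuda.is_available() else 'cpu'
tok = Tokenizer()
model = Transformer(tok.get_vocab()).to(device)

idx = torch.tensor([tok.encode("hello world")], dtype=torch.long, device=device)
logits, loss = model(idx)          # loss is None because no targets were passed
print(logits.shape)                # (1, sequence_length, vocab_size)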
base/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "batch_size": 10,
+   "block_size": 512,
+   "d_model": 512,
+   "n_heads": 8,
+   "n_layers": 8,
+   "dropout": 0.18,
+   "norm_eps": 1e-5,
+   "learning_rate": 3e-5
+ }
base/decoder.py ADDED
@@ -0,0 +1,213 @@
+ import json
+ with open('config.json', 'r', encoding='utf-8') as file:
+   params = json.load(file)
+
+ # required hyperparameters, shared by every module in this file
+ block_size = params['block_size']
+ d_model = params['d_model']
+ n_head = params['n_heads']
+ n_layers = params['n_layers']
+ learning_rate = params['learning_rate']
+ dropout = params['dropout']
+ norm_eps = params['norm_eps']
+
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ class RMSNorm(nn.Module):
+   """root-mean-square layer normalization with a learnable scale"""
+   def __init__(self, dim: int, eps: float = 1e-6):
+     super().__init__()
+     self.eps = eps
+     self.weight = nn.Parameter(torch.ones(dim))
+
+   def _norm(self, x):
+     return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+   def forward(self, x):
+     output = self._norm(x.float()).type_as(x)
+     return output * self.weight
+
+ class MaskedHead(nn.Module):
+   """single causal self-attention head with a lower-triangular mask"""
+   def __init__(self,
+                head_size: int,
+                d_model: int,
+                block_size: int,
+                dropout: float):
+     super().__init__()
+     self.key = nn.Linear(d_model, head_size, bias=True)
+     self.query = nn.Linear(d_model, head_size, bias=True)
+     self.value = nn.Linear(d_model, head_size, bias=True)
+     self.dropout = nn.Dropout(dropout)
+     self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
+
+   def forward(self, x: torch.Tensor):
+     B, T, C = x.shape
+     key = self.key(x)
+     query = self.query(x)
+     # scaled dot-product attention: scale the scores by 1/sqrt(head_size)
+     scores = torch.matmul(query, key.transpose(-2, -1)) * (key.shape[-1] ** -0.5)
+     scores = scores.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
+
+     att_mat = F.softmax(scores, dim=-1)
+     att_mat = self.dropout(att_mat)
+     value = self.value(x)
+     output = torch.matmul(att_mat, value)
+     return output
+
+ class UnMaskedHead(nn.Module):
+   """single bidirectional self-attention head with learned relative position scores"""
+   def __init__(self,
+                head_size: int,
+                d_model: int,
+                block_size: int,
+                dropout: float):
+     super().__init__()
+     self.key = nn.Linear(d_model, head_size, bias=True)
+     self.query = nn.Linear(d_model, head_size, bias=True)
+     self.value = nn.Linear(d_model, head_size, bias=True)
+     self.dropout = nn.Dropout(dropout)
+     self.rel_pos_embd = nn.Parameter(torch.randn(block_size, block_size, head_size))
+
+   def forward(self, x: torch.Tensor):
+     B, T, C = x.shape
+     key = self.key(x)
+     query = self.query(x)
+     scores = torch.matmul(query, key.transpose(-2, -1)) * (key.shape[-1] ** -0.5)
+
+     rel_pos_scores = torch.einsum('btc,tvc->btv', query, self.rel_pos_embd[:T, :T])
+     scores = scores + rel_pos_scores
+
+     att_mat = F.softmax(scores, dim=-1)
+     att_mat = self.dropout(att_mat)
+     value = self.value(x)
+     output = torch.matmul(att_mat, value)
+     return output
+
+ class MaskedAttention(nn.Module):
+   """multi-head causal attention followed by an output projection"""
+   def __init__(self,
+                d_model: int,
+                block_size: int,
+                n_head: int,
+                dropout: float):
+     super().__init__()
+     head_size = d_model // n_head
+     self.heads = nn.ModuleList([MaskedHead(d_model=d_model, dropout=dropout, block_size=block_size, head_size=head_size) for _ in range(n_head)])
+     self.projection = nn.Linear(d_model, d_model)
+     self.dropout = nn.Dropout(dropout)
+
+   def forward(self, x: torch.Tensor):
+     out = torch.cat([h(x) for h in self.heads], dim=-1)
+     out = self.dropout(self.projection(out))
+     return out
+
+ class UnMaskedAttention(nn.Module):
+   """multi-head bidirectional attention followed by an output projection"""
+   def __init__(self,
+                d_model: int,
+                block_size: int,
+                n_head: int,
+                dropout: float):
+     super().__init__()
+     head_size = d_model // n_head
+     self.heads = nn.ModuleList([UnMaskedHead(d_model=d_model, dropout=dropout, block_size=block_size, head_size=head_size) for _ in range(n_head)])
+     self.projection = nn.Linear(d_model, d_model)
+     self.dropout = nn.Dropout(dropout)
+
+   def forward(self, x: torch.Tensor):
+     out = torch.cat([h(x) for h in self.heads], dim=-1)
+     out = self.dropout(self.projection(out))
+     return out
+
+ class FeedForward(nn.Module):
+   """position-wise feed-forward network with GELU activations"""
+   def __init__(self, d_model, dropout):
+     super().__init__()
+     self.net = nn.Sequential(
+       nn.Linear(d_model, 5 * d_model),
+       nn.GELU(),
+       nn.Linear(5 * d_model, 5 * d_model),
+       nn.Dropout(dropout),
+       nn.GELU(),
+       nn.Linear(5 * d_model, d_model),
+       nn.Dropout(dropout),
+     )
+
+   def forward(self, x: torch.Tensor):
+     return self.net(x)
+
+ class DecoderBlock(nn.Module):
+   """pre-norm block: masked attention, unmasked attention, then feed-forward, each with a residual"""
+   def __init__(self, d_model: int,
+                block_size: int,
+                n_head: int,
+                norm_eps: float,
+                dropout: float):
+     super().__init__()
+     self.m_att = MaskedAttention(n_head=n_head, d_model=d_model, dropout=dropout, block_size=block_size)
+     self.um_att = UnMaskedAttention(n_head=n_head, d_model=d_model, dropout=dropout, block_size=block_size)
+     self.ffwd = FeedForward(d_model, dropout)
+     self.dropout = nn.Dropout(dropout)
+     self.norm = RMSNorm(d_model, eps=norm_eps)
+
+   def forward(self, x: torch.Tensor):
+     x_out = self.m_att(self.norm(x))
+     x_out = x + self.dropout(x_out)
+     del x
+
+     x = self.um_att(self.norm(x_out))
+     x = x_out + self.dropout(x)
+     del x_out
+
+     x_out = self.ffwd(self.norm(x))
+     x_out = x + self.dropout(x_out)
+     del x
+
+     return x_out
+
+ class Transformer(nn.Module):
+   """decoder-only language model: token + positional embeddings, stacked DecoderBlocks, and a vocab projection"""
+   def __init__(self, vocab_size: int):
+     super().__init__()
+     self.block_size = block_size
+     self.token_embeddings = nn.Embedding(vocab_size, d_model)
+     self.pos_encodings = nn.Embedding(block_size, d_model)
+     self.decoder = nn.Sequential(*[DecoderBlock(n_head=n_head, d_model=d_model, dropout=dropout, norm_eps=norm_eps, block_size=block_size) for _ in range(n_layers)])
+     self.norm_final = RMSNorm(d_model, eps=norm_eps)
+     self.linear_final = nn.Linear(d_model, vocab_size)
+     self.dropout = nn.Dropout(dropout)
+     self.apply(self._init_weights)
+
+   def _init_weights(self, module):
+     if isinstance(module, nn.Linear):
+       torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+       if module.bias is not None:
+         torch.nn.init.zeros_(module.bias.data)
+     elif isinstance(module, nn.Embedding):
+       torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+   def forward(self, idx, targets=None):
+     B, T = idx.shape
+     toked_model = self.token_embeddings(idx)
+     pos_encod = self.pos_encodings(torch.arange(T, device=device))
+     x = toked_model + pos_encod
+
+     x = self.decoder(x)
+     logits = self.linear_final(self.norm_final(x))
+
+     if targets is None:
+       loss = None
+     else:
+       B, T, C = logits.shape
+       logits = logits.view(B*T, C)
+       targets = targets.view(B*T)
+       loss = F.cross_entropy(logits, targets)
+
+     return logits, loss
+
+   def generate(self, idx: torch.Tensor, max_token: int = 10):
+     # greedy decoding: repeatedly append the argmax token over the last block_size positions
+     for _ in range(max_token):
+       idx_cond = idx[:, -self.block_size:]
+       logits, _ = self(idx_cond)                              # forward returns (logits, loss)
+       logits = logits[:, -1, :]
+       probs = F.softmax(logits, dim=-1)
+       idx_next = torch.argmax(probs, dim=-1, keepdim=True)    # keep shape (B, 1) for concatenation
+       idx = torch.cat((idx, idx_next), dim=1)
+     return idx
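Note: a standalone toy illustration of the tril-plus-softmax pattern used by MaskedHead above (self-contained; it does not import decoder.py, and the tensor sizes are arbitrary).

import torch
import torch.nn.functional as F

T, head_size = 4, 64
scores = torch.randn(T, T) * (head_size ** -0.5)        # scaled dot-product scores for a toy sequence
tril = torch.tril(torch.ones(T, T))                     # lower-triangular causal mask
scores = scores.masked_fill(tril == 0, float('-inf'))   # positions in the future get -inf
att = F.softmax(scores, dim=-1)                         # each row sums to 1 over past positions only
print(att)                                              # the upper triangle is exactly 0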
base/generate.py ADDED
@@ -0,0 +1,120 @@
+ import os
+ current_directory = os.path.dirname(os.path.abspath(__file__))
+ os.chdir(current_directory)
+
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ from tokenizer import Tokenizer
+ tokenizer = Tokenizer()
+ vocab_size = tokenizer.get_vocab()
+
+ from model import Transformer
+ model = Transformer(vocab_size)
+ checkpoint_path = '/content/drive/MyDrive/base-500m.pth'
+ checkpoint = torch.load(checkpoint_path, map_location=device)
+ model.load_state_dict(checkpoint)
+ m = model.to(device)
+ m.eval()
+
+ class Generate:
+   def __init__(self):
+     self.vocab_size = vocab_size
+     self.block_size = m.block_size
+
+   @torch.no_grad()
+   def generate(self, idx, max_new_tokens, temperature=1.0, top_k=0):
+     """
+     generate new tokens using the trained model
+
+     Args:
+       - idx (Tensor): input tensor of initial token indices
+       - max_new_tokens (int): maximum number of new tokens to generate
+       - temperature (float): softmax temperature for sampling
+       - top_k (int): number of top tokens to consider in sampling (0 disables filtering)
+
+     Returns:
+       - generated_tokens (list): list of generated token indices
+     """
+     generated_tokens = []
+
+     for _ in range(max_new_tokens):
+       idx_cond = idx[:, -m.block_size:]
+       logits, _ = m(idx_cond)                 # call the trained model, not this wrapper class
+       logits = logits[:, -1, :]
+
+       scaled_logits = logits / temperature
+       if top_k > 0:
+         scaled_logits = self._top_k_filtering(scaled_logits, top_k)
+
+       probs = F.softmax(scaled_logits, dim=-1)
+       sampled_idx = torch.multinomial(probs, num_samples=1)
+       generated_tokens.append(sampled_idx.item())
+       idx = torch.cat((idx, sampled_idx), dim=1)
+
+     return generated_tokens
+
+   @torch.no_grad()
+   def generate_masked_tokens(self, idx, masked_indices, temperature=1.0, top_k=0):
+     """
+     generate predictions for masked tokens using the trained model
+
+     Args:
+       - idx (Tensor): input tensor of token indices
+       - masked_indices (Tensor): indices of the masked positions
+       - temperature (float): softmax temperature for sampling
+       - top_k (int): number of top tokens to consider in sampling (0 disables filtering)
+
+     Returns:
+       - predicted_tokens (Tensor): tensor of predicted token indices
+     """
+     B, T = idx.shape
+
+     toked_model = m.toked_model(idx)
+     pos_encod = m.pos_encod(torch.arange(T, device=device))
+     x = toked_model + pos_encod
+
+     # run the stacked encoder and decoder layers, chaining each layer's output into the next
+     x_out = x
+     for layer in m.enc_layer:
+       x_out = layer(x_out)
+
+     x_final = x
+     for layer in m.dec_layer:
+       x_final = layer(x_final, x_out)
+
+     x_masked = x_final.clone()
+     x_masked[masked_indices] = m.toked_model(torch.tensor([6], device=device))  # hard-coded mask token id
+
+     x_masked = m.norm_final(x_masked)
+     logits = m.linear_final(x_masked)
+
+     masked_logits = logits[masked_indices].view(-1, logits.size(-1))
+     scaled_logits = masked_logits / temperature
+     if top_k > 0:
+       scaled_logits = self._top_k_filtering(scaled_logits, top_k)
+
+     probs = F.softmax(scaled_logits, dim=-1)
+     predicted_indices = torch.argmax(probs, dim=-1)
+
+     return predicted_indices
+
+   def _top_k_filtering(self, logits, top_k):
+     """
+     filter logits so that only the top-k tokens keep their scores
+
+     Args:
+       - logits (Tensor): input tensor of unscaled logits
+       - top_k (int): number of top tokens to keep
+
+     Returns:
+       - filtered_logits (Tensor): logits with everything outside the top-k set to -inf
+     """
+     values, _ = torch.topk(logits, top_k, dim=-1)
+     min_value = values[:, -1].unsqueeze(-1).expand_as(logits)
+     filtered_logits = torch.where(logits < min_value, torch.full_like(logits, float('-inf')), logits)
+
+     return filtered_logits
+
+ generator = Generate()
+
+ target_text = "I was in the market when"
+ context = torch.tensor([tokenizer.encode(target_text)], dtype=torch.long, device=device)
+ generated_output = tokenizer.decode(generator.generate(context, max_new_tokens=50))
+ print(target_text, generated_output)
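Note: to make the top-k step above concrete, a tiny self-contained example of the same idea on a toy logits row (illustrative only; it mirrors _top_k_filtering rather than importing generate.py, which would try to load the checkpoint).

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 0.5, 1.0, -1.0]])                # one row of unscaled logits
values, _ = torch.topk(logits, k=2, dim=-1)                   # top-2 values: [[2.0, 1.0]]
cutoff = values[:, -1].unsqueeze(-1)                          # smallest kept value: 1.0
filtered = torch.where(logits < cutoff, torch.full_like(logits, float('-inf')), logits)
probs = F.softmax(filtered, dim=-1)                           # probability mass only on the kept tokens
print(probs)                                                  # roughly [[0.73, 0.00, 0.27, 0.00]]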
base/model.py ADDED
@@ -0,0 +1,279 @@
+ import json
+ with open('config.json', 'r', encoding='utf-8') as file:
+   params = json.load(file)
+
+ # required hyperparameters
+ block_size = params['block_size']
+ d_model = params['d_model']
+ n_head = params['n_heads']
+ n_layers = params['n_layers']
+ learning_rate = params['learning_rate']
+ dropout = params['dropout']
+ norm_eps = params['norm_eps']
+
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ class RMSNorm(nn.Module):
+   def __init__(self, dim: int, eps: float = 1e-6):
+     """
+     Initialize the RMSNorm normalization layer.
+     Args:
+       dim (int): The dimension of the input tensor.
+       eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
+     Attributes:
+       eps (float): A small value added to the denominator for numerical stability.
+       weight (nn.Parameter): Learnable scaling parameter.
+     """
+     super().__init__()
+     self.eps = eps
+     self.weight = nn.Parameter(torch.ones(dim))
+
+   def _norm(self, x):
+     """
+     Apply the RMSNorm normalization to the input tensor.
+     Args:
+       x (torch.Tensor): The input tensor.
+     Returns:
+       torch.Tensor: The normalized tensor.
+     """
+     return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+   def forward(self, x):
+     """
+     Forward pass through the RMSNorm layer.
+     Args:
+       x (torch.Tensor): The input tensor.
+     Returns:
+       torch.Tensor: The output tensor after applying RMSNorm.
+     """
+     output = self._norm(x.float()).type_as(x)
+     return output * self.weight
+
+ class UnMaskedHead(nn.Module):
+   """single bidirectional self-attention head with learned relative position scores"""
+   def __init__(self, head_size, d_model, block_size, dropout):
+     super().__init__()
+     self.key = nn.Linear(d_model, head_size, bias=True)
+     self.query = nn.Linear(d_model, head_size, bias=True)
+     self.value = nn.Linear(d_model, head_size, bias=True)
+     self.dropout = nn.Dropout(dropout)
+     self.rel_pos_embd = nn.Parameter(torch.randn(block_size, block_size, head_size))
+
+   def forward(self, x):
+     B, T, C = x.shape
+     key = self.key(x)
+     query = self.query(x)
+
+     # scaled dot-product attention: scale the scores by 1/sqrt(head_size)
+     scores = torch.matmul(query, key.transpose(-2, -1)) * (key.shape[-1] ** -0.5)
+     rel_pos_scores = torch.einsum('btc,tvc->btv', query, self.rel_pos_embd[:T, :T])
+     scores = scores + rel_pos_scores
+
+     att_mat = F.softmax(scores, dim=-1)
+     att_mat = self.dropout(att_mat)
+     value = self.value(x)
+     output = torch.matmul(att_mat, value)
+     return output
+
+ class UnMaskedAttention(nn.Module):
+   """multi-head bidirectional attention followed by an output projection"""
+   def __init__(self, d_model, block_size, dropout, n_head):
+     super().__init__()
+     head_size = d_model // n_head
+     self.heads = nn.ModuleList([UnMaskedHead(d_model=d_model, dropout=dropout, block_size=block_size, head_size=head_size) for _ in range(n_head)])
+     self.proj = nn.Linear(n_head * head_size, d_model)
+     self.dropout = nn.Dropout(dropout)
+
+   def forward(self, x):
+     out = torch.cat([h(x) for h in self.heads], dim=-1)
+     out = self.dropout(self.proj(out))
+     return out
+
+ class MaskedHead(nn.Module):
+   """single causal self-attention head with a lower-triangular mask"""
+   def __init__(self, d_model, head_size, dropout, block_size):
+     super().__init__()
+     self.key = nn.Linear(d_model, head_size, bias=False)
+     self.query = nn.Linear(d_model, head_size, bias=False)
+     self.value = nn.Linear(d_model, head_size, bias=False)
+     self.dropout = nn.Dropout(dropout)
+     self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
+
+   def forward(self, x):
+     B, T, C = x.shape
+     key = self.key(x)
+     query = self.query(x)
+
+     scores = torch.matmul(query, key.transpose(-2, -1)) * (key.shape[-1] ** -0.5)
+     scores = scores.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
+
+     att_mat = F.softmax(scores, dim=-1)
+     att_mat = self.dropout(att_mat)
+     value = self.value(x)
+     output = torch.matmul(att_mat, value)
+     return output
+
+ class CasualMaskedAttention(nn.Module):
+   """multi-head causal (masked) attention followed by an output projection"""
+   def __init__(self, d_model, block_size, dropout, n_head):
+     super().__init__()
+     head_size = d_model // n_head
+     self.heads = nn.ModuleList([MaskedHead(d_model=d_model, dropout=dropout, block_size=block_size, head_size=head_size) for _ in range(n_head)])
+     self.proj = nn.Linear(n_head * head_size, d_model)
+     self.dropout = nn.Dropout(dropout)
+
+   def forward(self, x):
+     out = torch.cat([h(x) for h in self.heads], dim=-1)
+     out = self.dropout(self.proj(out))
+     return out
+
+ class FinalHead(nn.Module):
+   """single cross-attention head: keys and queries come from the encoder output, values from the decoder stream"""
+   def __init__(self, d_model, head_size, dropout, block_size):
+     super().__init__()
+     self.key = nn.Linear(d_model, head_size, bias=True)
+     self.query = nn.Linear(d_model, head_size, bias=True)
+     self.value = nn.Linear(d_model, head_size, bias=True)
+     self.dropout = nn.Dropout(dropout)
+
+   def forward(self, x, att):
+     B, T, C = x.shape
+     key = self.key(att)
+     query = self.query(att)
+
+     scores = torch.matmul(query, key.transpose(-2, -1)) * (key.shape[-1] ** -0.5)
+
+     att_mat = F.softmax(scores, dim=-1)
+     att_mat = self.dropout(att_mat)
+     value = self.value(x)
+     output = torch.matmul(att_mat, value)
+     return output
+
+ class FinalAttention(nn.Module):
+   """multi-head cross-attention followed by an output projection"""
+   def __init__(self, d_model, block_size, dropout, n_head):
+     super().__init__()
+     head_size = d_model // n_head
+     self.heads = nn.ModuleList([FinalHead(d_model=d_model, dropout=dropout, block_size=block_size, head_size=head_size) for _ in range(n_head)])
+     self.proj = nn.Linear(n_head * head_size, d_model)
+     self.dropout = nn.Dropout(dropout)
+
+   def forward(self, x, att):
+     out = torch.cat([h(x, att) for h in self.heads], dim=-1)
+     out = self.dropout(self.proj(out))
+     return out
+
+ class FeedForward(nn.Module):
+   """position-wise feed-forward network with a GELU activation"""
+   def __init__(self, d_model, dropout):
+     super().__init__()
+     self.net = nn.Sequential(
+       nn.Linear(d_model, 4*d_model),
+       nn.GELU(),
+       nn.Linear(4*d_model, d_model),
+       nn.Dropout(dropout)
+     )
+
+   def forward(self, x):
+     return self.net(x)
+
+ class EncoderNetwork(nn.Module):
+   """pre-norm encoder block: bidirectional self-attention then feed-forward, each with a residual"""
+   def __init__(self, d_model, n_head, norm_eps, dropout, block_size):
+     super().__init__()
+     self.s_att = UnMaskedAttention(n_head=n_head, d_model=d_model, dropout=dropout, block_size=block_size)
+     self.ffwd = FeedForward(d_model, dropout)
+     self.dropout = nn.Dropout(dropout)
+     self.norm = RMSNorm(d_model, eps=norm_eps)
+
+   def forward(self, src):
+     src = self.norm(src)
+     src_out = src + self.dropout(self.s_att(src))
+
+     src = self.norm(src_out)
+     src_f = src + self.dropout(self.ffwd(src))
+
+     del src_out, src
+     return src_f
+
+ class DecoderNetwork(nn.Module):
+   """pre-norm decoder block: masked self-attention, cross-attention over the encoder output, then feed-forward"""
+   def __init__(self, d_model, n_head, norm_eps, dropout, block_size):
+     super().__init__()
+     self.m_att = CasualMaskedAttention(n_head=n_head, d_model=d_model, dropout=dropout, block_size=block_size)
+     self.f_att = FinalAttention(d_model=d_model, n_head=n_head, dropout=dropout, block_size=block_size)
+     self.ffwd = FeedForward(d_model, dropout)
+     self.dropout = nn.Dropout(dropout)
+     self.norm = RMSNorm(d_model, eps=norm_eps)
+
+   def forward(self, src, att):
+     m_att_out = self.norm(src)
+     m_out = src + self.dropout(self.m_att(m_att_out))
+
+     f_out = self.f_att(m_out, self.norm(att))
+     f_out = m_out + self.dropout(f_out)
+
+     src_f = self.norm(f_out)
+     src_f = f_out + self.dropout(self.ffwd(src_f))
+
+     del f_out, m_out, m_att_out, src, att
+     return src_f
+
+ class Transformer(nn.Module):
+   """encoder-decoder transformer over token + positional embeddings, ending in a vocab projection"""
+   def __init__(self, vocab_size):
+     super().__init__()
+     self.block_size = block_size
+     self.toked_model = nn.Embedding(vocab_size, d_model)
+     self.pos_encod = nn.Embedding(block_size, d_model)
+     self.enc_layer = nn.ModuleList([EncoderNetwork(n_head=n_head, norm_eps=norm_eps, block_size=block_size, dropout=dropout, d_model=d_model) for _ in range(n_layers)])
+     self.dec_layer = nn.ModuleList([DecoderNetwork(n_head=n_head, norm_eps=norm_eps, block_size=block_size, dropout=dropout, d_model=d_model) for _ in range(n_layers)])
+     self.norm_final = RMSNorm(d_model, eps=norm_eps)
+     self.linear_final = nn.Linear(d_model, vocab_size)
+     self.dropout = nn.Dropout(dropout)
+     self.apply(self._init_weights)
+
+   def _init_weights(self, module):
+     """
+     initialize weights of linear and embedding layers
+
+     Args:
+       - module (nn.Module): the module to initialize weights for
+     """
+     if isinstance(module, nn.Linear):
+       torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+       if module.bias is not None:
+         torch.nn.init.zeros_(module.bias.data)
+     elif isinstance(module, nn.Embedding):
+       torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+   def forward(self, idx, targets=None):
+     """
+     forward pass of the transformer model
+
+     Args:
+       - idx (Tensor): input tensor of token indices
+       - targets (Tensor): target tensor for computing loss during training
+
+     Returns:
+       - logits (Tensor): output logits from the final linear layer
+       - loss (Tensor): optional; computed cross-entropy loss if targets are provided, else None
+     """
+     B, T = idx.shape
+
+     toked_model = self.toked_model(idx)
+     pos_encod = self.pos_encod(torch.arange(T, device=device))
+     x = toked_model + pos_encod
+
+     # chain each layer's output into the next so the full stack is used
+     x_out = x
+     for layer in self.enc_layer:
+       x_out = layer(x_out)
+
+     x_final = x
+     for layer in self.dec_layer:
+       x_final = layer(x_final, x_out)
+
+     x_final = self.norm_final(x_final)
+     logits = self.linear_final(x_final)
+
+     if targets is None:
+       loss = None
+     else:
+       B, T, C = logits.shape
+       logits = logits.view(B*T, C)
+       targets = targets.view(B*T)
+       loss = F.cross_entropy(logits, targets)
+
+     return logits, loss
base/run.py ADDED
@@ -0,0 +1,101 @@
+ """
+ use this file to train the model
+
+ working:
+   - imports various dependencies first, and then loads the training data
+   - tokenizes it with the tiktoken-based Tokenizer
+   - loads the required hyperparameters and the model file
+   - trains until 'max_iters', saves the model state, and generates outputs
+
+ with the current configuration, the model can reach up to ~60 million parameters
+ and can become ~99% accurate at next-token prediction
+ """
+
+ import torch
+ import json
+ import os
+ current_directory = os.path.dirname(os.path.abspath(__file__))
+ os.chdir(current_directory)
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ with open('../datasets/wiki_176m.txt', 'r', encoding='utf-8') as file:
+   data = file.read()
+
+ print(f"{(len(data)/1e6):.2f} million letters")
+
+ from tokenizer import Tokenizer
+
+ tokenizer = Tokenizer()
+ vocab_size = tokenizer.get_vocab()
+
+ # Train and test splits
+ data = torch.tensor(tokenizer.encode(data), dtype=torch.long)
+ n = int(0.9*len(data))  # first 90% will be train, rest val
+ train_data = data[:n]
+ val_data = data[n:]
+
+ with open('config.json', 'r', encoding='utf-8') as file:
+   params = json.load(file)
+
+ # required parameters
+ batch_size = params['batch_size']
+ block_size = params['block_size']
+ max_iters = 1000
+ eval_interval = 100
+ eval_iters = 200
+ learning_rate = params['learning_rate']
+
+ torch.manual_seed(1400)
+
+ # data loading
+ def get_batch(split):
+   # generate a small batch of data of inputs x and targets y
+   data = train_data if split == 'train' else val_data
+   ix = torch.randint(len(data) - block_size, (batch_size,))
+   x = torch.stack([data[i:i+block_size] for i in ix])
+   y = torch.stack([data[i+1:i+block_size+1] for i in ix])
+   x, y = x.to(device), y.to(device)
+   return x, y
+
+ @torch.no_grad()
+ def estimate_loss():
+   # average the loss over eval_iters batches for both splits
+   out = {}
+   model.eval()
+   for split in ['train', 'val']:
+     losses = torch.zeros(eval_iters)
+     for k in range(eval_iters):
+       X, Y = get_batch(split)
+       logits, loss = model(X, Y)
+       losses[k] = loss.item()
+     out[split] = losses.mean()
+   model.train()
+   return out
+
+ from model import Transformer
+ model = Transformer(vocab_size)
+ m = model.to(device)
+
+ # number of parameters
+ n_param = sum(p.numel() for p in m.parameters())/1e6
+ print(f"{n_param:.2f} million parameters")
+
+ optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+ steps = []
+ train_losses = []
+ val_losses = []
+
+ for iter in range(max_iters):
+
+   # periodically evaluate the loss on the train and val splits
+   if iter % eval_interval == 0 or iter == max_iters - 1:
+     losses = estimate_loss()
+     print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+
+     steps.append(iter)
+     train_losses.append(losses['train'])
+     val_losses.append(losses['val'])
+
+   # sample a batch, compute the loss, and take one optimizer step
+   xb, yb = get_batch('train')
+   logits, loss = model(xb, yb)
+   optimizer.zero_grad(set_to_none=True)
+   loss.backward()
+   optimizer.step()
+
+ torch.save(model.state_dict(), f'enigma_{n_param:.0f}m.pth')
base/tokenizer.py ADDED
@@ -0,0 +1,19 @@
+ import tiktoken
+
+ pre_encodings = 'p50k_base'
+ pre_model = 'text-davinci-003'
+
+ class Tokenizer:
+   def __init__(self, encoding=None, model=None):
+     self.encodings = encoding if encoding is not None else pre_encodings
+     self.model = model if model is not None else pre_model
+     # an explicitly passed encoding wins; otherwise derive the encoding from the model name
+     if encoding is not None:
+       self.tokenizer = tiktoken.get_encoding(self.encodings)
+     else:
+       self.tokenizer = tiktoken.encoding_for_model(self.model)
+
+   def encode(self, data):
+     return self.tokenizer.encode(data)
+
+   def decode(self, tokens):
+     return self.tokenizer.decode(tokens)
+
+   def get_vocab(self):
+     return self.tokenizer.n_vocab
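Note: a short usage sketch for the tokenizer (assumed usage, run from inside base/); get_vocab() is what sizes the embedding and output layers of both Transformer variants.

from tokenizer import Tokenizer

tok = Tokenizer()                      # defaults to the text-davinci-003 / p50k_base encoding
ids = tok.encode("I was in the market when")
print(ids)                             # list of integer token ids
print(tok.decode(ids))                 # round-trips back to the original string
print(tok.get_vocab())                 # vocabulary size passed to Transformer(vocab_size)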