arcAman07 committed on
Commit 7da7e5d · 1 Parent(s): 4f3596b

Delete model.py

Files changed (1)
  1. model.py +0 -118
model.py DELETED
@@ -1,118 +0,0 @@
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- import matplotlib.pyplot as plt
-
- # hyperparameters
- batch_size = 16 # how many independent sequences will we process in parallel?
- block_size = 64 # what is the maximum context length for predictions?
- max_iters = 5000
- eval_interval = 100
- learning_rate = 1e-3
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
- eval_iters = 200
- n_embd = 128
- n_head = 8
- n_layer = 4
- dropout = 0.0
- vocab = 101
- # ------------
-
-
- class Head(nn.Module):
-     def __init__(self, head_size):
-         super(Head,self).__init__()
-         self.head_size = head_size
-         self.dropout = nn.Dropout(dropout)
-         self.key = nn.Linear(n_embd, head_size, bias=False)
-         self.query = nn.Linear(n_embd, head_size, bias=False)
-         self.value = nn.Linear(n_embd, head_size, bias=False)
-         self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
-     def forward(self,x):
-         k = self.key(x)
-         q = self.query(x)
-         wei = q @ k.transpose(-2,-1) * (self.head_size ** -0.5)
-         wei = wei.masked_fill(self.tril[:x.size(1), :x.size(1)] == 0, float('-inf'))  # crop the causal mask to the current sequence length
-         wei = F.softmax(wei, dim=-1)
-         wei = self.dropout(wei)
-         v = self.value(x)
-         out = wei @ v
-         return out
-
- class MultiHeadAttention(nn.Module):
-     def __init__(self, n_head, head_size):
-         super(MultiHeadAttention,self).__init__()
-         self.head_size = head_size
-         self.n_head = n_head
-         self.heads = nn.ModuleList([Head(head_size) for _ in range(n_head)])
-         self.out = nn.Linear(n_embd, n_embd)
-         self.dropout = nn.Dropout(dropout)
-
-     def forward(self,x):
-         out = torch.cat([h(x) for h in self.heads], dim=-1)
-         out = self.out(out)
-         out = self.dropout(out)
-         return out
-
- class FeedForwardLayer(nn.Module):
-     def __init__(self, n_embd):
-         super(FeedForwardLayer, self).__init__()
-         self.n_embd = n_embd
-         self.fc1 = nn.Linear(n_embd, 4*n_embd)
-         self.fc2 = nn.Linear(4*n_embd,n_embd)
-         self.dropout = nn.Dropout(dropout)
-
-     def forward(self, x):
-         out = self.fc1(x)
-         out = F.gelu(out)
-         out = self.fc2(out)
-         out = self.dropout(out)
-         return out
-
- class Block(nn.Module):
-     def __init__(self):
-         super(Block, self).__init__()
-         self.attn = MultiHeadAttention(n_head, n_embd // n_head)
-         self.ff = FeedForwardLayer(n_embd)
-         self.ln1 = nn.LayerNorm(n_embd)
-         self.ln2 = nn.LayerNorm(n_embd)
-     def forward(self,x):
-         x = x + self.attn(self.ln1(x))
-         x = x + self.ff(self.ln2(x))
-         return x
-
- class Transformer(nn.Module):
-     def __init__(self, n_embd, n_layer):
-         super(Transformer, self).__init__()
-         self.n_embd = n_embd
-         self.n_layer = n_layer
-         self.token_embedding = nn.Embedding(vocab, n_embd)
-         self.position_embedding = nn.Embedding(block_size,n_embd)
-         self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])
-         self.ln_f = nn.LayerNorm(n_embd) # final layer norm
-         self.ffwd = nn.Linear(n_embd, vocab)
-
-     def forward(self, idx, targets=None):
-         B,T = idx.shape
-         x = self.token_embedding(idx) + self.position_embedding(torch.arange(T, device=idx.device))
-         x = self.blocks(x)
-         x = self.ln_f(x)
-         logits = self.ffwd(x)
-         if targets is None:
-             loss = None
-         else:
-             B,T,C = logits.shape
-             logits = logits.view(B*T, C)
-             targets = targets.view(B*T)
-             loss = F.cross_entropy(logits, targets, ignore_index=0)
-         return logits,loss
-
-     def generate(self, idx, max_tokens):
-         for _ in range(max_tokens):
-             idx_cond = idx[:, -block_size:]
-             logits, _ = self(idx_cond)
-             logits = logits[:,-1,:]
-             probs = F.softmax(logits, dim=-1)
-             idx_next = torch.multinomial(probs, num_samples=1)
-             idx = torch.cat([idx, idx_next], dim=-1)
-         return idx
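
For reference, a minimal sketch of how the deleted Transformer could be exercised, assuming untrained weights and a hypothetical three-token prompt (token ids below vocab = 101); this snippet is illustrative only and is not part of the commit:

    model = Transformer(n_embd, n_layer).to(device)
    # hypothetical prompt: one sequence of token ids, shape (1, 3)
    prompt = torch.tensor([[1, 5, 42]], dtype=torch.long, device=device)
    generated = model.generate(prompt, max_tokens=20)  # appends 20 sampled tokens
    print(generated.shape)  # torch.Size([1, 23])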