arcAman07 committed on
Commit
9d1893a
·
1 Parent(s): a3c6bef

added entire model

Browse files
Files changed (4) hide show
  1. app.py +57 -0
  2. model.py +119 -0
  3. requirements.txt +1 -0
  4. train.py +83 -0
app.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from model import Transformer
6
+
7
# hyperparameters
# Only block_size, n_embd and n_layer are read by the code in this script.
batch_size = 16  # how many independent sequences will we process in parallel?
block_size = 64  # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 128
n_head = 8
n_layer = 4
dropout = 0.0
vocab = 101
# ------------

# The character vocabulary is rebuilt from the training corpus so that the
# integer ids produced here line up with the ones the weights were trained on.
# NOTE(review): this is an absolute path into one developer's home directory;
# it has to exist on whatever machine runs the app -- confirm before deploying.
with open('/Users/deepaksharma/Documents/Python/Kaggle/GenerateKanyeLyrics/Kanye West Lyrics.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(set(text))

stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Map a string to a list of integer token ids (one per character)."""
    return [stoi[c] for c in s]


def decode(l):
    """Map an iterable of integer token ids back to a string."""
    return ''.join([itos[c] for c in l])


model = Transformer(n_embd, n_layer)
# The model is never moved off the CPU in this script, so force the checkpoint
# onto the CPU as well; without map_location a checkpoint saved from a GPU
# cannot be deserialized on a machine that has no CUDA device.
model.load_state_dict(torch.load('model_weights.pth', map_location='cpu'))
model.eval()
37
+
38
def generate_kanye_lyrics(text, max_tokens=500):
    """Continue ``text`` with ``max_tokens`` sampled characters.

    The model is always fed exactly ``block_size`` tokens: a prompt shorter
    than that is left-padded with token id 0, and a longer prompt is cut down
    to its last ``block_size`` characters (the cut-off prefix is re-attached
    to the returned string unchanged).

    Args:
        text: Prompt typed by the user. ``None`` is treated as an empty
            prompt. Characters that are not in the training vocabulary
            (``stoi``) are dropped, because they have no token id.
        max_tokens: Number of characters to generate. ``None`` (an empty
            number box) falls back to 500; negative values are treated as 0.

    Returns:
        The (filtered) prompt followed by the generated continuation.
    """
    text = ''.join(ch for ch in (text or "") if ch in stoi)
    if max_tokens is None:
        max_tokens = 500
    max_tokens = max(int(max_tokens), 0)

    # Number of pad tokens needed in front of a short prompt (0 otherwise).
    padding = max(block_size - len(text), 0)
    # Everything before the final block_size characters is not shown to the
    # model; for prompts of at most block_size characters this is "".
    initial_text = text[:-block_size]
    context = [0] * padding + encode(text[-block_size:])

    lyrics = torch.tensor(context, dtype=torch.long).unsqueeze(0)
    # Sampling needs no gradients; skip autograd bookkeeping.
    with torch.no_grad():
        generated = model.generate(lyrics, max_tokens=max_tokens)
    # One character per token, so slicing the decoded string by `padding`
    # removes exactly the pad tokens.
    return initial_text + decode(generated[0].tolist())[padding:]
54
+
55
# Gradio front end: a two-line prompt box and a numeric box are passed, in
# that order, to generate_kanye_lyrics; the returned string is shown as text.
prompt_box = gr.Textbox(lines=2, placeholder="Enter Starting lyrics ...")
token_count_box = gr.Number()
demo = gr.Interface(
    fn=generate_kanye_lyrics,
    inputs=[prompt_box, token_count_box],
    outputs="text",
)

demo.launch()
model.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
# ---- hyperparameters -------------------------------------------------------
# Module-level configuration. The layer classes in this file read block_size,
# n_embd, n_head, dropout and vocab directly from here; the remaining values
# are not referenced by those classes.
batch_size = 16      # independent sequences processed in parallel
block_size = 64      # maximum context length for predictions
max_iters = 5000
eval_interval = 100
learning_rate = 0.001
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_iters = 200
n_embd = 128         # embedding width
n_head = 8           # attention heads per block
n_layer = 4
dropout = 0.0
vocab = 101          # size of the token embedding / output layer
# ----------------------------------------------------------------------------
19
+
20
+
21
class Head(nn.Module):
    """One head of masked self-attention.

    Each position may only attend to itself and to earlier positions: scores
    above the diagonal are set to -inf before the softmax.

    Args:
        head_size: Output width of the key, query and value projections.

    The input width (``n_embd``), the maximum sequence length (``block_size``)
    and the dropout rate (``dropout``) are read from module-level settings.
    """

    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        self.dropout = nn.Dropout(dropout)
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask; a buffer so it follows the module across
        # devices and is stored in the state dict, but is not trained.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """Return attention output of shape ``(..., T, head_size)``.

        ``x`` is expected to be ``(..., T, n_embd)`` with ``T <= block_size``.
        """
        T = x.shape[-2]
        k = self.key(x)
        q = self.query(x)
        # Scaled dot-product scores, shape (..., T, T).
        wei = q @ k.transpose(-2, -1) * (self.head_size ** -0.5)
        # Slice the mask to the actual sequence length; using the full
        # block_size x block_size mask only works when T == block_size.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v
40
+
41
class MultiHeadAttention(nn.Module):
    """Several ``Head`` modules run side by side, then projected to ``n_embd``.

    Args:
        n_head: Number of attention heads.
        head_size: Output width of each head.

    The per-head outputs are concatenated on the last dimension (width
    ``n_head * head_size``), passed through a linear layer that maps back to
    the module-level ``n_embd``, and then through dropout.
    """

    def __init__(self, n_head, head_size):
        super().__init__()
        self.head_size = head_size
        self.n_head = n_head
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_head)])
        # The projection consumes the concatenated head outputs, so its input
        # width is n_head * head_size. That equals n_embd whenever n_embd is
        # divisible by n_head (the shipped configuration), and keeps the layer
        # usable when it is not.
        self.out = nn.Linear(n_head * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to ``x`` and merge the results."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.out(out)
        out = self.dropout(out)
        return out
55
+
56
class FeedForwardLayer(nn.Module):
    """Two-layer MLP applied to the last dimension of its input.

    Widens ``n_embd`` features to ``4 * n_embd``, applies GELU, narrows back
    to ``n_embd`` and finishes with dropout (rate taken from the module-level
    ``dropout`` setting).
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        self.n_embd = n_embd
        self.fc1 = nn.Linear(n_embd, hidden_width)
        self.fc2 = nn.Linear(hidden_width, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the transformed tensor; same shape as ``x``."""
        activated = F.gelu(self.fc1(x))
        return self.dropout(self.fc2(activated))
70
+
71
class Block(nn.Module):
    """One residual layer: attention, then feed-forward, each after LayerNorm.

    All sizes come from the module-level ``n_embd`` and ``n_head`` settings;
    every head gets ``n_embd // n_head`` features.
    """

    def __init__(self):
        super().__init__()
        self.attn = MultiHeadAttention(n_head, n_embd // n_head)
        self.ff = FeedForwardLayer(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Add the attention branch, then the feed-forward branch, to ``x``."""
        after_attention = x + self.attn(self.ln1(x))
        return after_attention + self.ff(self.ln2(after_attention))
82
+
83
class Transformer(nn.Module):
    """Token-level language model.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` ``Block`` modules and a final LayerNorm, and projected to
    ``vocab`` logits per position.

    Args:
        n_embd: Embedding width. ``Block()`` takes no arguments and reads the
            module-level ``n_embd``, so this must equal that value.
        n_layer: Number of ``Block`` modules to stack.
    """

    def __init__(self, n_embd, n_layer):
        super().__init__()
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.token_embedding = nn.Embedding(vocab, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.ffwd = nn.Linear(n_embd, vocab)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the training loss.

        Args:
            idx: Integer tensor of token ids, shape ``(B, T)``. ``T`` must not
                exceed ``block_size`` (the position table has that many rows).
            targets: Optional integer tensor with ``B * T`` elements.

        Returns:
            ``(logits, loss)``. Without targets: logits of shape
            ``(B, T, vocab)`` and ``None``. With targets: logits flattened to
            ``(B * T, vocab)`` and the scalar cross-entropy loss.
        """
        B, T = idx.shape
        positions = torch.arange(T, device=idx.device)
        x = self.token_embedding(idx) + self.position_embedding(positions)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.ffwd(x)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        # NOTE(review): ignore_index=0 excludes every target equal to 0 from
        # the loss. train.py assigns id 0 to the first character of the sorted
        # vocabulary, so that character is never a training target -- confirm
        # this is intended (app.py also uses 0 as its left-padding id).
        loss = F.cross_entropy(logits, targets, ignore_index=0)
        return logits, loss

    def generate(self, idx, max_tokens):
        """Autoregressively sample ``max_tokens`` new tokens.

        Args:
            idx: Integer tensor of token ids, shape ``(B, T)``.
            max_tokens: Number of tokens to append to every row.

        Returns:
            Tensor of shape ``(B, T + max_tokens)``: the input followed by the
            sampled tokens.
        """
        # Sampling never needs gradients; disabling them avoids building an
        # autograd graph for every generated token.
        with torch.no_grad():
            for _ in range(max_tokens):
                # The model only has block_size positions: keep the tail.
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                # Distribution for the token following the last position.
                probs = F.softmax(logits[:, -1, :], dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_next], dim=-1)
        return idx
118
+
119
# Emitted once, when this module is first imported.
print(torch.__version__)
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ torch==1.13.0
train.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from model import Transformer
5
+
6
# Read the whole lyrics corpus into memory.
with open('/Users/deepaksharma/Documents/Python/Kaggle/GenerateKanyeLyrics/Kanye West Lyrics.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character, in sorted order.
chars = sorted(set(text))

# Lookup tables between characters and their integer ids.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Turn a string into a list of integer ids, one per character."""
    return [stoi[c] for c in s]


def decode(l):
    """Turn an iterable of integer ids back into a string."""
    return ''.join(itos[c] for c in l)


# The full corpus as one 1-D tensor of token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# First 90% of the corpus for training, the remainder for validation.
n = int(0.9 * len(text))
train_data, val_data = data[:n], data[n:]
22
+
23
def get_batch(split):
    """Draw a random batch of input/target windows from one data split.

    Args:
        split: ``'train'`` or ``'val'``.

    Returns:
        ``(x, y)``, each of shape ``(batch_size, block_size)``; ``y`` is ``x``
        shifted one position to the right within the source data.

    Raises:
        ValueError: If ``split`` is neither ``'train'`` nor ``'val'``.
    """
    if split == 'train':
        source = train_data
    elif split == 'val':
        source = val_data
    else:
        raise ValueError("Invalid split")

    # Random window start offsets; the upper bound leaves room for the
    # one-step-shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return x, y
35
+
36
# hyperparameters
batch_size = 16  # how many independent sequences will we process in parallel?
block_size = 64  # what is the maximum context length for predictions?
max_iters = 5000  # NOTE(review): not used below; the loop runs 20000 steps
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 128
n_head = 8
n_layer = 4
dropout = 0.0
vocab = len(chars)
# ------------


model = Transformer(n_embd, n_layer)

# `vocab` above is local to this script; the model sizes its embedding table
# from model.py's own setting. Fail early with a clear message if the corpus
# contains more distinct characters than the model has embedding rows.
if vocab > model.token_embedding.num_embeddings:
    raise ValueError(
        f"corpus has {vocab} distinct characters but the model only supports "
        f"{model.token_embedding.num_embeddings}"
    )

print("Total params: ", sum(p.numel() for p in model.parameters()))

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for steps in range(20000):
    x, y = get_batch('train')
    logits, loss = model(x, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if steps % 100 == 0:
        print("Step: ", steps, " Loss: ", loss.item())

# Print model's state_dict
print("Model's state_dict:")
for param_name, param_value in model.state_dict().items():
    print(param_name, "\t", param_value.size())

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name, var_value in optimizer.state_dict().items():
    print(var_name, "\t", var_value)

# Saved under the file name that app.py loads, so a fresh training run is
# picked up by the demo without a manual rename.
torch.save(model.state_dict(), 'model_weights.pth')

# Sample a continuation of a fixed prompt as a quick sanity check.
lyrics = encode("Bitch I am back on my comma , sipping on my CocaCola, driving on a hangover ")
lyrics = torch.tensor(lyrics, dtype=torch.long).unsqueeze(0)  # add batch dim

# No gradients are needed for sampling.
with torch.no_grad():
    sample = model.generate(lyrics, max_tokens=1000)

print(decode(sample[0].tolist()))