igorcheb committed
Commit ef32598 · 1 Parent(s): 006c840

Create training.py

Files changed (1):
  1. training.py +106 -0
training.py ADDED
import os

import gym
import numpy as np
import torch
import matplotlib.pyplot as plt
from IPython.display import clear_output

from agent_class import ParameterisedPolicy

# Assumed values: neither constant is defined in this file in the original commit.
DISCOUNT = 0.99                          # discount factor for the return
env_name = 'LunarLanderContinuous-v2'    # inferred from the checkpoint file name


def create_cum_rewards(rewards, discount=DISCOUNT):
    """Turn per-step rewards into discounted returns-to-go."""
    new_rews = [0]
    for el in rewards[::-1]:
        val = el + discount * new_rews[-1]
        new_rews.append(val)
    return torch.tensor(new_rews[1:][::-1], dtype=torch.float32)


def play_game(env, model, render=False):
    """Roll out one episode (classic Gym API) and collect rewards and action log-probs."""
    observation = env.reset()

    rewards, logits = [], []
    while True:
        if render:
            env.render()

        mus, sigmas = model(torch.tensor(observation, dtype=torch.float32))

        # Sample a continuous action from the policy's diagonal Gaussian.
        m = torch.distributions.normal.Normal(mus, sigmas)
        action = m.sample()
        observation, reward, done, info = env.step(action.detach().numpy())

        rewards.append(reward)
        logits.append(m.log_prob(action).sum())

        if done:
            break
    env.close()

    return rewards, logits


def draw_gradients_rewards(model, rewards, ep_lengths, ave_over_steps):
    """Plot smoothed episode returns/lengths and gradient histograms for each dense layer."""
    fig, axs = plt.subplot_mosaic([['1', '1', '2', '2'], ['3', '4', '5', '6']],
                                  constrained_layout=False, figsize=(20, 9))

    axs['1'].plot(np.array(rewards[:ave_over_steps * (len(rewards) // ave_over_steps)])
                  .reshape(-1, ave_over_steps).mean(axis=-1))
    axs['1'].set_title('Sum rewards per episode')

    axs['1'].hlines(200, 0, len(rewards) / ave_over_steps, colors='red')
    axs['1'].hlines(150, 0, len(rewards) / ave_over_steps, colors='orange')
    axs['1'].hlines(0, 0, len(rewards) / ave_over_steps, colors='green')

    axs['2'].plot(np.array(ep_lengths[:ave_over_steps * (len(ep_lengths) // ave_over_steps)])
                  .reshape(-1, ave_over_steps).mean(axis=-1))
    axs['2'].set_title('Episode length')

    axs['3'].hist(model.lin_1.weight.grad.flatten().detach().numpy(), bins=50)
    axs['3'].set_xlabel('Grads in dense layer 1')

    axs['4'].hist(model.lin_2.weight.grad.flatten().detach().numpy(), bins=50)
    axs['4'].set_xlabel('Grads in dense layer 2')

    axs['5'].hist(model.lin_3.weight.grad.flatten().detach().numpy(), bins=50)
    axs['5'].set_xlabel('Grads in dense layer 3')

    axs['6'].hist(model.lin_4.weight.grad.flatten().detach().numpy(), bins=50)
    axs['6'].set_xlabel('Grads in dense layer 4')


model = ParameterisedPolicy()
opt = torch.optim.Adam(model.parameters(), lr=0.0008)
lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=4000, gamma=0.7)
rews, ep_lengths = [], []

last_max_score = 50  # only checkpoint models that beat this smoothed score
env = gym.make(env_name)
os.makedirs('best_models', exist_ok=True)  # make sure the checkpoint directory exists

for step in range(int(10e3)):
    rewards, logits = play_game(env, model, render=False)

    cum_rewards = create_cum_rewards(rewards, discount=DISCOUNT)
    stacked_logits = torch.stack(logits).flatten()

    # REINFORCE loss: negative log-probabilities weighted by discounted returns.
    loss = -(stacked_logits * cum_rewards).mean()

    rews.append(np.sum(rewards))
    ep_lengths.append(len(rewards))

    opt.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 50)
    opt.step()
    lr_scheduler.step()

    if step % 40 == 0:
        if step > 1:
            clear_output()  # requires an IPython/Jupyter environment
        draw_gradients_rewards(model, rewards=rews,
                               ep_lengths=ep_lengths, ave_over_steps=40)
        plt.show()

    if len(rews) > 40:
        agg_rews = np.array(rews[-40 * (len(rews) // 40):]).reshape(-1, 40).mean(axis=-1)
        if agg_rews[-1] > last_max_score:
            last_max_score = agg_rews[-1]
            print('NEW BEST MODEL, STEP:', step, 'SCORE: ', last_max_score)
            save_path = f'best_models/best_reinforce_lunar_lander_cont_model_{round(last_max_score, 3)}.pt'
            torch.save(model, save_path)
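
training.py imports ParameterisedPolicy from agent_class, which is not part of this commit. Below is a minimal sketch of a policy network consistent with how the script uses it: four dense layers named lin_1 through lin_4 and a forward pass that returns a (mus, sigmas) pair defining a diagonal Gaussian over the two continuous LunarLander actions. The hidden width, activations, and softplus parameterisation of the standard deviation are assumptions, not the repository's actual class.

# Hypothetical sketch of agent_class.ParameterisedPolicy; layer sizes and
# activations are assumed — only the attribute names and the (mus, sigmas)
# output are taken from training.py.
import torch.nn as nn
import torch.nn.functional as F


class ParameterisedPolicy(nn.Module):
    """Gaussian policy: maps an 8-D LunarLander observation to per-action mean and std."""

    def __init__(self, obs_dim=8, act_dim=2, hidden=64):
        super().__init__()
        self.lin_1 = nn.Linear(obs_dim, hidden)
        self.lin_2 = nn.Linear(hidden, hidden)
        self.lin_3 = nn.Linear(hidden, act_dim)   # action means
        self.lin_4 = nn.Linear(hidden, act_dim)   # pre-softplus action stds

    def forward(self, x):
        h = F.relu(self.lin_1(x))
        h = F.relu(self.lin_2(h))
        mus = self.lin_3(h)
        sigmas = F.softplus(self.lin_4(h)) + 1e-5  # keep the std strictly positive
        return mus, sigmas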