onkarpandit-g42 commited on
Commit
7616769
1 Parent(s): 0cb2bdc

Upload params_train.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. params_train.yaml +194 -0
params_train.yaml ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_input:
  batch_size: 256
  data_processor: GptHDF5MapDataProcessor
  data_dir: /cra-406/datasets/jais_instruction_datasets/v12p2/tokenized_mlv2_2k/
  num_workers: 1
  persistent_workers: true
  prefetch_factor: 10
  repeat: true
  shuffle: false
  shuffle_seed: 1
  use_worker_cache: false
  vocab_size: 84992
eval_input:
  batch_size: 32
  data_dir: /cb/customers/g42/datasets/multilingual_v2/pile_val_correct_packed
  data_processor: GptHDF5MapDataProcessor
  num_workers: 1
  repeat: false
  shuffle: false
  use_worker_cache: false
  vocab_size: 84992
model:
  mixed_precision: true
  fp16_type: cbfloat16
  boundary_casting: false
  lora_params: null
  vocab_size: 84992
  embedding_layer_norm: false
  embedding_dropout_rate: 0.0
  share_embedding_weights: true
  position_embedding_type: alibi
  max_position_embeddings: 2048
  position_embedding_offset: 0
  num_relative_attention_buckets: 32
  rotary_dim: null
  rope_theta: 10000
  pad_rope: false
  alibi_trainable_slopes: false
  pos_scaling_factor: 1.0
  hidden_size: 1088
  num_hidden_layers: 14
  dropout_rate: 0.0
  norm_type: layernorm
  layer_norm_epsilon: 1.0e-05
  num_heads: 17
  attention_module: aiayn_attention
  extra_attention_params: {}
  attention_type: scaled_dot_product
  attention_dropout_rate: 0.0
  use_projection_bias_in_attention: true
  use_ffn_bias_in_attention: true
  attention_softmax_fp32: false
  attention_kernel: optimized_beta
  attention_sliding_window_length: null
  scale_qk_dot_by_layer_idx: false
  fixed_sparse_attention: null
  filter_size: 2912
  nonlinearity: swiglu
  use_ffn_bias: true
  use_bias_in_output: false
  loss_scaling: num_tokens
  loss_weight: 1.0
  embeddings_scale: 9.1705785388303
  scale_qk_dot_by_d: true
  output_logits_scale: 0.2576902348606329
  initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.04203434605680388
    a: -0.08406869211360776
    b: 0.08406869211360776
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  initializer_range: 0.02
  embedding_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.0866560243479838
    a: -0.1733120486959676
    b: 0.1733120486959676
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  output_layer_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.007943744727823684
    a: -0.015887489455647368
    b: 0.015887489455647368
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  compute_eval_metrics: true
  sparsity: null
optimizer:
  optimizer_type: AdamW
  weight_decay: 0.1
  log_summaries: true
  loss_scaling_factor: dynamic
  learning_rate:
  - end_learning_rate: 0.0016
    initial_learning_rate: 0.0
    scheduler: Linear
    total_iters: 695
  - end_learning_rate: 0.00016
    initial_learning_rate: 0.0016
    scheduler: Linear
    total_iters: 23995
  max_gradient_norm: 1.0
  adjust_learning_rate:
    decoder_kernel: 0.23529411764705882
  betas:
  - 0.9
  - 0.95
  correct_bias: true
  eps: 1.0e-08
runconfig:
  steps_per_epoch: null
  max_steps: 24690
  mgmt_address: null
  mount_dirs:
  - /cra-406
  num_epochs: null
  python_paths:
  - /cra-406/workdirs/modelzoos/rel-2.2.1/modelzoo/src
  compile_dir: null
  checkpoint_path: /cra-406/workdirs/240209_Jais_series_v3/artifacts/model_dir_256M/checkpoint_240320.mdl
  credentials_path: null
  debug_args_path: null
  retrace_every_iteration: null
  eval_steps: 5219
  init_method: env://
  job_time_sec: null
  job_labels:
  - Name=Neha_Sengupta
  - Organization=Core42
  - Model=Jais_256M
  - Mode=Train
  - Num_CSX=4
  - Language=Bilingual
  - Type=Train
  - Dataset=v12p2
  job_priority: p3
  seed: 1
  mgmt_namespace: null
  load_checkpoint_states: model
  target_device: CSX
  mode: train
  wsc_log_level: null
  autoload_last_checkpoint: true
  check_loss_values: true
  disable_strict_checkpoint_loading: null
  dist_addr: localhost:8888
  dist_backend: nccl
  checkpoint_steps: 8231
  disable_version_check: true
  drop_data: false
  enable_distributed: false
  model_dir: artifacts/jais_256M_v12p2_gbs256
  save_initial_checkpoint: false
  precision_opt_level: 1
  num_workers_per_csx: 0
  validate_only: null
  logging: null
  sync_batchnorm: false
  compile_only: null
  log_steps: 1
  num_steps: null
  transfer_processes: null
  num_wgt_servers: null
  num_csx: 4
  num_act_servers: null
  eval_frequency: null
  execute_crd_memory_gi: null
  compile_crd_memory_gi: null
  op_profiler_config: null
  dump_activations: false
  log_input_summaries: false
  main_process_id: 0
  max_checkpoints: 100000
  summary_dir: null
  lazy_initialization: true
  use_cstorch_optimizer_step: false
  wrk_memory_gi: null
  act_memory_gi: null
  cmd_memory_gi: null
  wgt_memory_gi: null
  experimental: {}
  ini: null
  debug_args: null