# Multitask fine-tuning configuration (run: multitask_train).
# Provenance: snapshot of commit 56df21f (original file size 8,045 bytes).
# NOTE(review): the leading web-viewer artifacts (file-size caption, blank
# filler lines, and a duplicated line-number gutter) were extraction residue,
# not part of the YAML document, and have been removed.
---
run_name: multitask_train
seed: 6198
epoch: null
dry_run: false
model:
  d_model: 3584
  n_heads: 28
  n_kv_heads: 4
  qkv_bias: true
  clip_qkv: null
  n_layers: 28
  mlp_ratio: 4
  mlp_hidden_size: 37888
  activation_type: swiglu
  block_type: sequential
  block_group_size: 1
  alibi: false
  alibi_bias_max: 8.0
  rope: true
  rope_full_precision: true
  rope_theta: 1000000.0
  rope_impl: llama
  vision_backbone:
    image_model_type: openai
    image_default_input_size:
    - 336
    - 336
    image_patch_size: 14
    image_pos_patch_size: 14
    image_emb_dim: 1024
    image_num_heads: 16
    image_num_key_value_heads: 16
    image_num_layers: 23
    image_head_dim: 64
    image_mlp_dim: 4096
    image_mlp_activations: quick_gelu
    image_dropout_rate: 0.0
    image_num_pos: 577
    image_norm_eps: 1.0e-05
    attention_dropout: 0.0
    residual_dropout: 0.0
    initializer_range: 0.02
    fsdp_wrap: false
    resize_mode: default
  vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
  llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
  low_cpu_fsdp: true
  attention_type: sdpa
  float32_attention: true
  attention_dropout: 0.0
  response_attention_dropout: 0.0
  multi_query_attention: null
  attention_layer_norm: false
  residual_dropout: 0.0
  response_residual_dropout: 0.1
  embedding_dropout: 0.0
  layer_norm_type: rms
  layer_norm_with_affine: true
  layer_norm_eps: 1.0e-06
  attention_layer_norm_with_affine: true
  max_sequence_length: 4096
  max_position_embeddings: null
  include_bias: false
  bias_for_layer_norm: null
  scale_logits: false
  vocab_size: 152064
  embedding_size: 152064
  additional_vocab_size: 128
  new_embedding_init_range: 0.02
  weight_tying: false
  pad_token_id: -1
  init_device: null
  init_fn: normal
  init_std: 0.02
  init_cutoff_factor: null
  norm_after: false
  precision: amp_bf16
  max_crops: 12
  crop_mode: overlap-and-resize-c2
  do_random_scale: false
  use_col_tokens: true
  prompt_type: none
  system_prompt_kind: style_and_length
  message_formatting: none
  always_start_with_space: true
  prompt_override: null
  default_inference_len: 65
  overlap_margins:
  - 4
  - 4
  image_padding_embed: pad_and_partial_pad
  vit_layers:
  - -2
  - -9
  image_pooling_h: 2
  image_pooling_w: 2
  image_pooling_2d: attention_meanq
  image_projector: mlp
  image_feature_dropout: 0.0
  use_cls_feature: false
  fix_image_input_idx: 2
  unconditioned: false
  pad_to: null
  initializer_range: 0.02
  pad_tokenizer: true
  normalize_input_embeds: false
  use_position_ids: true
  query_pre_attn_scalar: 224
  attn_logit_softcapping: null
  final_logit_softcapping: null
  head_dim: null
  tokenizer:
    identifier: mm:hf-Qwen/Qwen2-7B
    truncate_direction: right
    tokenizer_adds_space: false
    tokenizer_dir: null
    olmo_bos_token_id: null
    olmo_eos_token_id: null
  loss_token_weighting: null
  gin_bindings: null
ft_llm: true
ft_vit: true
ft_connector: true
ft_embedding: lm_head
optimizer:
  name: adamw
  learning_rate: 0.0001
  weight_decay: 0.01
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-05
  connector_learning_rate: 0.0002
  vit_learning_rate: 6.0e-06
  llm_learning_rate: 2.0e-05
  connector_weight_decay: 0.0
  vit_weight_decay: 0.0
  llm_weight_decay: 0.0
  connector_betas:
  - 0.9
  - 0.95
  vit_betas:
  - 0.9
  - 0.95
  llm_betas:
  - 0.9
  - 0.95
  connector_eps: 1.0e-06
  vit_eps: 1.0e-06
  llm_eps: 1.0e-06
  no_decay_norm_and_bias: null
  decay_norm_and_bias: false
  decay_embeddings: false
  metrics_log_interval: 20
scheduler:
  name: multimodal
  units: steps
  t_warmup: 100
  t_max: null
  alpha_f: 0.1
  connector_t_warmup: 200
  vit_t_warmup: 2000
  llm_t_warmup: 2000
  grad_clip_warmup_steps: null
  grad_clip_warmup_factor: null
  warmup_min_lr: 0.0
data:
  multi_modal: true
  mixture_or_task_name: cockatoo_and_transcript_712k_sept6
  paths: null
  datasets: null
  label_mask_paths: null
  pad_direction: right
  generate_attention_mask: false
  num_workers: 0
  drop_last: true
  pin_memory: false
  prefetch_factor: null
  persistent_workers: false
  timeout: 0
  seed: null
  instance_filter: null
  mixture: null
  sequence_length: 2304
  shuffle: true
  for_inference: false
  split: train
  use_memory_cache: false
  num_epochs: null
  shuffle_buffer_size: 1000
  per_node_data_loader: null
restore_dataloader: true
fast_forward_batches: null
evaluators:
- label: val
  type: multi_modal_lm
  data:
    multi_modal: true
    mixture_or_task_name: cockatoo_and_transcript_712k_sept6
    paths: null
    datasets: null
    label_mask_paths: null
    pad_direction: right
    generate_attention_mask: false
    num_workers: 0
    drop_last: true
    pin_memory: false
    prefetch_factor: null
    persistent_workers: false
    timeout: 0
    seed: null
    instance_filter: null
    mixture: null
    sequence_length: 2304
    shuffle: false
    for_inference: false
    split: validation
    use_memory_cache: false
    num_epochs: null
    shuffle_buffer_size: 1000
    per_node_data_loader: null
  device_eval_batch_size: null
  subset_num_batches: 8
  max_new_tokens: 448
  mm_evaluator: null
  save_dir: null
  save_to_checkpoint_dir: false
  eval_name: null
  skip_if_metrics_cached: true
- label: caption_val
  type: multi_modal_lm
  data:
    multi_modal: true
    mixture_or_task_name: cockatoo_476k_gpt_captions
    paths: null
    datasets: null
    label_mask_paths: null
    pad_direction: right
    generate_attention_mask: false
    num_workers: 0
    drop_last: true
    pin_memory: false
    prefetch_factor: null
    persistent_workers: false
    timeout: 0
    seed: null
    instance_filter: null
    mixture: null
    sequence_length: 2304
    shuffle: false
    for_inference: false
    split: validation
    use_memory_cache: false
    num_epochs: null
    shuffle_buffer_size: 1000
    per_node_data_loader: null
  device_eval_batch_size: null
  subset_num_batches: 8
  max_new_tokens: 448
  mm_evaluator: null
  save_dir: null
  save_to_checkpoint_dir: false
  eval_name: null
  skip_if_metrics_cached: true
eval_interval: 1000
inf_eval_interval: -1
inf_evaluators: []
save_folder: /weka/oe-training-default/chrisc/cockatoo/models/dense-captioner-v22-qwen2/v2-lr2620
remote_save_folder: null
canceled_check_interval: 50
save_interval: 4000
save_interval_unsharded: 22300
save_interval_ephemeral: null
save_num_checkpoints_to_keep: 1
save_num_unsharded_checkpoints_to_keep: -1
save_overwrite: true
force_save_unsharded: false
no_pre_train_checkpoint: true
initial_model_checkpoint: null
load_model_config: null
load_path: null
load_path_sharded_checkpointer: null
reset_optimizer_state: false
reset_trainer_state: false
save_dataloader_state: false
reset_dataloader_state: false
sharded_checkpointer: torch_legacy
new_style_checkpoints: null
max_duration: 22300
global_train_batch_size: 128
device_train_batch_size: 2
device_train_microbatch_size: 4
device_eval_batch_size: 4
eval_subset_num_batches: -1
eval_on_load: false
device_inf_eval_batch_size: 16
inf_eval_subset_num_batches: -1
device_train_grad_accum: 0
max_grad_norm: 1.0
batch_divisor: global_batch
max_grad_norm_ratio: null
precision: amp_bf16
wandb:
  project: cockatoo
  entity: prior-ai2
  group: dense-captioner-v22-qwen2
  name: v2-lr2620
  tags:
  - watching
  log_artifacts: false
  rank_zero_only: true
  log_interval: 20
speed_monitor:
  window_size: 20
  gpu_flops_available: null
console_log_interval: 20
gen1_gc_interval: 1
compile: null
fsdp:
  use_orig_params: true
  sharding_strategy: FULL_SHARD
  wrapping_strategy: by_block_and_size
  precision: float
  hybrid_sharding_num_model_replicas: null
softmax_auxiliary_loss: true
softmax_auxiliary_loss_scale: 0.0001
time_limit: null
extra_steps_after_cancel: 10
early_stopping_factor: null
save_data_indices: false
python_profiling: false
torch_profiling: false
stop_at: 22300
stop_after: null
activation_checkpointing: whole_layer
fused_loss: null
tfds_dir: /weka/oe-training-default/mm-olmo/tensorflow_datasets