model_name: molmo
llm:
  d_model: 3584
  n_heads: 28
  n_kv_heads: 4
  head_dim: null
  qkv_bias: true
  clip_qkv: null
  n_layers: 28
  mlp_ratio: 4
  mlp_hidden_size: 37888
  activation_type: swiglu
  block_type: sequential
  rope: true
  rope_full_precision: true
  rope_theta: 1000000.0
  rope_type: default
  rope_factor: null
  rope_high_freq_factor: null
  rope_low_freq_factor: null
  rope_original_max_position_embeddings: null
  attention_type: sdpa
  float32_attention: true
  attention_dropout: 0.0
  attention_layer_norm: false
  attention_layer_norm_type: olmo
  residual_dropout: 0.1
  response_residual_dropout: 0.0
  layer_norm_type: rms
  layer_norm_with_affine: true
  layer_norm_eps: 1.0e-06
  attention_layer_norm_with_affine: true
  max_sequence_length: 4096
  max_position_embeddings: null
  include_bias: false
  bias_for_layer_norm: null
  norm_after: false
  moe_num_experts: 8
  moe_top_k: 2
  moe_mlp_impl: sparse
  moe_log_expert_assignment: false
  moe_shared_expert: false
  moe_lbl_in_fp32: false
  moe_interleave: false
  moe_loss_weight: 0.1
  moe_zloss_weight: null
  moe_dropless: true
  moe_capacity_factor: 1.25
  embedding_dropout: 0.0
  scale_logits: false
  vocab_size: 152064
  additional_vocab_size: 128
  weight_tying: false
  embedding_size: 152064
  use_position_ids: true
  tokenizer:
    identifier: Qwen/Qwen2.5-7B
    tokenizer_dir: null
    depth_tokens: true
  init_path: gs://mm-olmo/pretrained_llms/qwen2.5-7b.pt
  init_incremental: null
  new_embedding_init_range: 0.02
  initializer_range: 0.02
  normalize_input_embeds: false
  activation_checkpoint: whole_layer
  compile: blocks
  fix_pad_tokenizer: false
  resize_vocab: false
  init_std: 0.02
  init_fn: normal
  init_cutoff_factor: null
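A minimal sketch of the dimensions implied by the llm block above (head_dim is null, so it is derived from d_model and n_heads; reading mlp_hidden_size as a fused gate+up projection for swiglu is an assumption, not stated in this file):

```python
# Hedged sketch: derived dimensions for the llm block.
d_model, n_heads, n_kv_heads = 3584, 28, 4
head_dim = d_model // n_heads        # 128 (head_dim is null, so derived)
kv_groups = n_heads // n_kv_heads    # 7 query heads per KV head (grouped-query attention)

# Assumption: mlp_hidden_size covers the fused gate+up projections of SwiGLU;
# halving it gives 18944, the intermediate size of Qwen2.5-7B.
mlp_hidden_size = 37888
swiglu_intermediate = mlp_hidden_size // 2

print(head_dim, kv_groups, swiglu_intermediate)
```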
vision_backbone:
  vit:
    image_model_type: siglip
    image_default_input_size:
    - 378
    - 378
    image_patch_size: 14
    image_pos_patch_size: 14
    image_emb_dim: 1152
    image_num_heads: 16
    image_num_key_value_heads: 16
    image_num_layers: 27
    image_head_dim: 72
    image_mlp_dim: 4304
    image_mlp_activations: gelu_pytorch_tanh
    image_dropout_rate: 0.0
    image_num_pos: 729
    image_norm_eps: 1.0e-06
    attention_dropout: 0.0
    residual_dropout: 0.0
    initializer_range: 0.02
    float32_attention: true
    attention_type: sdpa
    activation_checkpointing: true
    init_path: gs://mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
    resize_mode: siglip
    pad_value: 0.0
    normalize: siglip
  image_pooling_2d: attention_meanq
  pooling_attention_mask: false
  image_projector: mlp
  image_padding_embed: null
  vit_layers:
  - -3
  - -9
  skip_unused_layers: true
  image_feature_dropout: 0.0
  connector_activation_checkpointing: true
  compile_vit: blocks
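For orientation, a minimal sketch of the patch arithmetic implied by the vit block above (treating vit_layers as indices counted from the end of the encoder, with the selected features concatenated before the MLP connector, is an assumption):

```python
# Hedged sketch: patch-grid arithmetic for the SigLIP encoder configured above.
image_size, patch_size = 378, 14
patches_per_side = image_size // patch_size   # 27
num_patches = patches_per_side ** 2           # 729, consistent with image_num_pos

emb_dim = 1152
vit_layers = [-3, -9]                          # assumed: features taken from these layers
connector_in_dim = emb_dim * len(vit_layers)   # 2304 if the two layers are concatenated (assumption)

print(num_patches, connector_in_dim)
```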
data_formatter:
  prompt_templates: uber_model
  message_format: role
  system_prompt: demo_or_style
  always_start_with_space: false
  default_inference_len: 65
  select_answer: best
  debug: false
  image_last: false
  format_message_list: null
  p_one_message: 0.0
mm_preprocessor:
  crop_mode: overlap-and-resize-c2
  max_crops: 8
  max_images: 2
  max_multi_image_crops: 8
  pooling_w: 2
  pooling_h: 2
  overlap_margins:
  - 4
  - 4
  use_col_tokens: true
  loss_token_weighting: root_subsegments
  legacy_image_mask: false
  max_answer_len: null
  img_aug: false
bi_directional_attn: null
lora_enable: false
lora_rank: 64
lora_alpha: 16
lora_dropout: 0.05
lora_bias: none
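If lora_enable were set to true, the lora_* fields above map naturally onto a Hugging Face peft LoraConfig; a sketch is below (target_modules is a hypothetical choice, not taken from this file):

```python
# Hedged sketch: the lora_* fields expressed as a peft LoraConfig.
from peft import LoraConfig

lora_cfg = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # hypothetical choice
)
```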
n_action_bins: 256
norm_stats:
  fractal20220817_data:
    action:
      mean:
      - 0.006987582892179489
      - 0.006265917327255011
      - -0.01262515690177679
      - 0.04333311319351196
      - -0.005756212864071131
      - 0.0009130256366916001
      - 0.5354204773902893
      std:
      - 0.0692116990685463
      - 0.05970962345600128
      - 0.07353084534406662
      - 0.15610496699810028
      - 0.13164450228214264
      - 0.14593800902366638
      - 0.497110515832901
      max:
      - 2.9984593391418457
      - 22.09052848815918
      - 2.7507524490356445
      - 1.570636510848999
      - 1.5321086645126343
      - 1.5691522359848022
      - 1.0
      min:
      - -2.0204520225524902
      - -5.497899532318115
      - -2.031663417816162
      - -1.569917917251587
      - -1.569892168045044
      - -1.570419430732727
      - 0.0
      q01:
      - -0.22453527510166169
      - -0.14820013284683228
      - -0.231589707583189
      - -0.3517994859814644
      - -0.4193011274933815
      - -0.43643461108207704
      - 0.0
      q99:
      - 0.17824687153100965
      - 0.14938379630446405
      - 0.21842354819178575
      - 0.5892666035890578
      - 0.35272657424211445
      - 0.44796681255102094
      - 1.0
      mask:
      - true
      - true
      - true
      - true
      - true
      - true
      - false
    proprio:
      mean:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      std:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      max:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      min:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q01:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q99:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
    num_transitions: 3786400
    num_trajectories: 87212
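The statistics above give per-dimension mean, std, and 1st/99th percentiles for a 7-dimensional action. A sketch of one common way such stats are combined with n_action_bins in VLA-style models, clipping to [q01, q99], rescaling to [-1, 1], and discretizing into 256 uniform bins, follows; whether this model uses exactly this recipe is an assumption:

```python
# Hedged sketch: a common VLA recipe for using norm_stats with n_action_bins.
# Only the q01/q99 values (rounded) come from the file above; the clip -> rescale
# -> uniform-bin discretization itself is an assumption about this model.
import numpy as np

q01 = np.array([-0.2245, -0.1482, -0.2316, -0.3518, -0.4193, -0.4364, 0.0])
q99 = np.array([0.1782, 0.1494, 0.2184, 0.5893, 0.3527, 0.4480, 1.0])
n_action_bins = 256

def discretize(action: np.ndarray) -> np.ndarray:
    """Clip to [q01, q99], rescale to [-1, 1], and bucket into uniform bins."""
    clipped = np.clip(action, q01, q99)
    scaled = 2.0 * (clipped - q01) / (q99 - q01 + 1e-8) - 1.0
    edges = np.linspace(-1.0, 1.0, n_action_bins + 1)[1:-1]
    return np.digitize(scaled, edges)  # integer bin ids in [0, n_action_bins - 1]

print(discretize(np.zeros(7)))
```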