model_name: molmo
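# LLM decoder configuration; weights and tokenizer are initialized from
# Qwen/Qwen2.5-7B (see init_path and tokenizer.identifier below).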
llm:
  d_model: 3584
  n_heads: 28
  n_kv_heads: 4
  head_dim: null
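  # Grouped-query attention: 28 query heads share 4 KV heads. With head_dim
  # left null it is derived as d_model / n_heads = 3584 / 28 = 128.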
  qkv_bias: true
  clip_qkv: null
  n_layers: 28
  mlp_ratio: 4
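  # The explicit hidden size below overrides mlp_ratio: 37888 = 2 x 18944,
  # the fused gate+up projection width for SwiGLU (matching Qwen2.5-7B's
  # intermediate size).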
  mlp_hidden_size: 37888
  activation_type: swiglu
  block_type: sequential
  rope: true
  rope_full_precision: true
  rope_theta: 1000000.0
  rope_type: default
  rope_factor: null
  rope_high_freq_factor: null
  rope_low_freq_factor: null
  rope_original_max_position_embeddings: null
  attention_type: sdpa
  float32_attention: true
  attention_dropout: 0.0
  attention_layer_norm: false
  attention_layer_norm_type: olmo
  residual_dropout: 0.1
  response_residual_dropout: 0.0
  layer_norm_type: rms
  layer_norm_with_affine: true
  layer_norm_eps: 1.0e-06
  attention_layer_norm_with_affine: true
  max_sequence_length: 4096
  max_position_embeddings: null
  include_bias: false
  bias_for_layer_norm: null
  norm_after: false
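  # The MoE settings below appear to be inactive defaults: with
  # block_type: sequential the model uses dense transformer blocks.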
  moe_num_experts: 8
  moe_top_k: 2
  moe_mlp_impl: sparse
  moe_log_expert_assignment: false
  moe_shared_expert: false
  moe_lbl_in_fp32: false
  moe_interleave: false
  moe_loss_weight: 0.1
  moe_zloss_weight: null
  moe_dropless: true
  moe_capacity_factor: 1.25
  embedding_dropout: 0.0
  scale_logits: false
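  # Qwen2.5's 152064-row embedding matrix, plus 128 extra embedding rows
  # reserved for added special tokens.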
  vocab_size: 152064
  additional_vocab_size: 128
  weight_tying: false
  embedding_size: 152064
  use_position_ids: true
  tokenizer:
    identifier: Qwen/Qwen2.5-7B
    tokenizer_dir: null
    depth_tokens: true
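    # depth_tokens: true presumably registers MolmoAct's depth tokens in the
    # tokenizer vocabulary.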
  init_path: gs://mm-olmo/pretrained_llms/qwen2.5-7b.pt
  init_incremental: null
  new_embedding_init_range: 0.02
  initializer_range: 0.02
  normalize_input_embeds: false
  activation_checkpoint: whole_layer
  compile: blocks
  fix_pad_tokenizer: false
  resize_vocab: false
  init_std: 0.02
  init_fn: normal
  init_cutoff_factor: null
vision_backbone:
  vit:
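    # SigLIP2 so400m/14 image encoder at 378x378 input resolution (see
    # init_path below): 378 / 14 = 27, giving 27 x 27 = 729 patch positions,
    # which matches image_num_pos.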
    image_model_type: siglip
    image_default_input_size:
    - 378
    - 378
    image_patch_size: 14
    image_pos_patch_size: 14
    image_emb_dim: 1152
    image_num_heads: 16
    image_num_key_value_heads: 16
    image_num_layers: 27
    image_head_dim: 72
    image_mlp_dim: 4304
    image_mlp_activations: gelu_pytorch_tanh
    image_dropout_rate: 0.0
    image_num_pos: 729
    image_norm_eps: 1.0e-06
    attention_dropout: 0.0
    residual_dropout: 0.0
    initializer_range: 0.02
    float32_attention: true
    attention_type: sdpa
    activation_checkpointing: true
    init_path: gs://mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
    resize_mode: siglip
    pad_value: 0.0
    normalize: siglip
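  # Connector: ViT features are pooled with a mean-query attention pooler
  # (window size set by pooling_w/pooling_h under mm_preprocessor) and then
  # projected into the LLM embedding space with an MLP.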
  image_pooling_2d: attention_meanq
  pooling_attention_mask: false
  image_projector: mlp
  image_padding_embed: null
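  # Image features are taken from the 3rd- and 9th-from-last ViT layers and,
  # presumably, concatenated before pooling.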
  vit_layers:
  - -3
  - -9
  skip_unused_layers: true
  image_feature_dropout: 0.0
  connector_activation_checkpointing: true
  compile_vit: blocks
data_formatter:
  prompt_templates: uber_model
  message_format: role
  system_prompt: demo_or_style
  always_start_with_space: false
  default_inference_len: 65
  select_answer: best
  debug: false
  image_last: false
  format_message_list: null
  p_one_message: 0.0
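# Image preprocessing uses Molmo's overlapping-crops scheme
# (crop_mode: overlap-and-resize-c2): up to 8 crops per image with a 4-patch
# overlap margin on each side, plus column-separator tokens.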
mm_preprocessor:
  crop_mode: overlap-and-resize-c2
  max_crops: 8
  max_images: 2
  max_multi_image_crops: 8
  pooling_w: 2
  pooling_h: 2
  overlap_margins:
  - 4
  - 4
  use_col_tokens: true
  loss_token_weighting: root_subsegments
  legacy_image_mask: false
max_answer_len: null
img_aug: false
bi_directional_attn: null
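# LoRA fine-tuning is disabled here, so the rank/alpha/dropout values below
# are inactive defaults.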
lora_enable: false
lora_rank: 64
lora_alpha: 16
lora_dropout: 0.05
lora_bias: none
n_action_bins: 256
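# Action head: continuous actions are discretized into 256 bins. norm_stats
# holds per-dataset normalization statistics; fractal20220817_data is the
# RT-1 (Google robot) dataset from Open X-Embodiment, with 7-D actions
# (presumably xyz translation, rpy rotation, and gripper).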
norm_stats:
  fractal20220817_data:
    action:
      mean:
      - 0.006987582892179489
      - 0.006265917327255011
      - -0.01262515690177679
      - 0.04333311319351196
      - -0.005756212864071131
      - 0.0009130256366916001
      - 0.5354204773902893
      std:
      - 0.0692116990685463
      - 0.05970962345600128
      - 0.07353084534406662
      - 0.15610496699810028
      - 0.13164450228214264
      - 0.14593800902366638
      - 0.497110515832901
      max:
      - 2.9984593391418457
      - 22.09052848815918
      - 2.7507524490356445
      - 1.570636510848999
      - 1.5321086645126343
      - 1.5691522359848022
      - 1.0
      min:
      - -2.0204520225524902
      - -5.497899532318115
      - -2.031663417816162
      - -1.569917917251587
      - -1.569892168045044
      - -1.570419430732727
      - 0.0
      q01:
      - -0.22453527510166169
      - -0.14820013284683228
      - -0.231589707583189
      - -0.3517994859814644
      - -0.4193011274933815
      - -0.43643461108207704
      - 0.0
      q99:
      - 0.17824687153100965
      - 0.14938379630446405
      - 0.21842354819178575
      - 0.5892666035890578
      - 0.35272657424211445
      - 0.44796681255102094
      - 1.0
      mask:
      - true
      - true
      - true
      - true
      - true
      - true
      - false
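      # The mask above excludes the final dimension (presumably the gripper)
      # from normalization.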
    proprio:
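      # Proprioception statistics are all zero, i.e. unset/unused for this
      # dataset.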
      mean:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      std:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      max:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      min:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q01:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      q99:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
    num_transitions: 3786400
    num_trajectories: 87212